{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': 'simple', 'log_file': 'chkpt/ja-en.do02.ado01/train.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'wmt23', 'azureml_logging': False, 'seed': 0, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 8, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 8, 'distributed_num_procs': 8, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': 'tcp://localhost:26147', 'distributed_port': 26147, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 8, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 16384, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 100000, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 16384, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [4], 'lr': [0.001], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False, 'debug_param_names': False}, 'checkpoint': {'_name': None, 'save_dir': 'chkpt/ja-en.do02.ado01', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 100000, 'save_interval_updates': 1000, 'keep_interval_updates': 10, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 8}, 'generation': {'_name': None, 'beam': 5, 'beam_mt': 0, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'max_len_a_mt': 0.0, 'max_len_b_mt': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'lenpen_mt': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(no_progress_bar=False, log_interval=100, log_format='simple', log_file='chkpt/ja-en.do02.ado01/train.log', aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project='wmt23', azureml_logging=False, seed=0, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=True, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=8, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='label_smoothed_cross_entropy', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=0, skip_invalid_size_inputs_valid_test=False, max_tokens=16384, batch_size=None, required_batch_size_multiple=8, required_seq_len_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', combine_valid_subsets=None, ignore_unused_valid_subsets=False, validate_interval=100000, validate_interval_updates=0, validate_after_updates=0, fixed_validation_seed=None, disable_validation=False, max_tokens_valid=16384, batch_size_valid=None, max_valid_steps=None, curriculum=0, gen_subset='test', num_shards=1, shard_id=0, grouped_shuffling=False, update_epoch_batch_itr=False, update_ordered_indices_seed=False, distributed_world_size=8, distributed_num_procs=8, distributed_rank=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, device_id=0, distributed_no_spawn=False, ddp_backend='pytorch_ddp', ddp_comm_hook='none', bucket_cap_mb=25, fix_batches_to_gpus=False, find_unused_parameters=False, gradient_as_bucket_view=False, fast_stat_sync=False, heartbeat_timeout=-1, broadcast_buffers=False, slowmo_momentum=None, slowmo_base_algorithm='localsgd', localsgd_frequency=3, nprocs_per_node=8, pipeline_model_parallel=False, pipeline_balance=None, pipeline_devices=None, pipeline_chunks=0, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_checkpoint='never', zero_sharding='none', no_reshard_after_forward=False, fp32_reduce_scatter=False, cpu_offload=False, use_sharded_state=False, not_fsdp_flatten_parameters=False, arch='transformer_vaswani_wmt_en_de_big', max_epoch=0, max_update=60000, stop_time_hours=0, clip_norm=1.0, sentence_avg=False, update_freq=[4], lr=[0.001], stop_min_lr=-1.0, use_bmuf=False, skip_remainder_batch=False, debug_param_names=False, save_dir='chkpt/ja-en.do02.ado01', restore_file='checkpoint_last.pt', continue_once=None, finetune_from_model=None, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, optimizer_overrides='{}', save_interval=100000, save_interval_updates=1000, keep_interval_updates=10, keep_interval_updates_pattern=-1, keep_last_epochs=-1, keep_best_checkpoints=-1, no_save=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_save_optimizer_state=False, best_checkpoint_metric='loss', maximize_best_checkpoint_metric=False, patience=-1, checkpoint_suffix='', checkpoint_shard_count=1, load_checkpoint_on_all_dp_ranks=False, write_checkpoints_asynchronously=False, store_ema=False, ema_decay=0.9999, ema_start_update=0, ema_seed_model=None, ema_update_freq=1, ema_fp32=False, data='binarized/ja-en/', source_lang=None, target_lang=None, load_alignments=False, left_pad_source=True, left_pad_target=False, upsample_primary=-1, truncate_source=False, num_batch_buckets=0, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_tokenized_bleu=False, eval_bleu_remove_bpe=None, eval_bleu_print_samples=False, label_smoothing=0.1, report_accuracy=False, ignore_prefix_size=0, adam_betas='(0.9, 0.98)', adam_eps=1e-08, weight_decay=0.0, use_old_adam=False, fp16_adam_stats=False, warmup_updates=4000, warmup_init_lr=-1, pad=1, eos=2, unk=3, encoder_ffn_embed_dim=8192, decoder_ffn_embed_dim=8192, dropout=0.2, attention_dropout=0.1, share_decoder_input_output_embed=True, no_seed_provided=False, encoder_embed_dim=1024, encoder_attention_heads=16, encoder_normalize_before=False, decoder_embed_dim=1024, decoder_attention_heads=16, encoder_embed_path=None, encoder_layers=6, encoder_learned_pos=False, decoder_embed_path=None, decoder_layers=6, decoder_normalize_before=False, decoder_learned_pos=False, activation_dropout=0.0, activation_fn='relu', adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, share_all_embeddings=False, merge_src_tgt_embed=False, no_token_positional_embeddings=False, adaptive_input=False, no_cross_attention=False, cross_self_attention=False, decoder_output_dim=1024, decoder_input_dim=1024, no_scale_embedding=False, layernorm_embedding=False, tie_adaptive_weights=False, checkpoint_activations=False, offload_activations=False, encoder_layers_to_keep=None, decoder_layers_to_keep=None, encoder_layerdrop=0, decoder_layerdrop=0, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, _name='transformer_vaswani_wmt_en_de_big'), 'task': {'_name': 'translation', 'data': 'binarized/ja-en/', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.001]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.001]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} TransformerModel( (encoder): TransformerEncoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(32000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerEncoderLayerBase( (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (dropout_module): FairseqDropout() (activation_dropout_module): FairseqDropout() (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) ) (decoder): TransformerDecoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(16000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerDecoderLayerBase( (dropout_module): FairseqDropout() (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (activation_dropout_module): FairseqDropout() (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (encoder_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (encoder_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) (output_projection): Linear(in_features=1024, out_features=16000, bias=False) ) ) task: TranslationTask model: TransformerModel criterion: LabelSmoothedCrossEntropyCriterion num. shared model params: 326,221,824 (num. trained: 326,221,824) num. expert model params: 0 (num. trained: 0) training on 8 devices (GPUs/TPUs) max tokens per device = 16384 and max sentences per device = None begin dry-run validation on "valid" subset Start iterating over samples epoch 001: 101 / 1689 loss=12.123, nll_loss=11.814, ppl=3601.26, wps=550779, ups=1.11, wpb=495063, bsz=16556.9, num_updates=100, lr=2.5e-05, gnorm=2.504, clip=77, loss_scale=4, train_wall=94, gb_free=21, wall=113 epoch 001: 201 / 1689 loss=10.552, nll_loss=10.011, ppl=1031.66, wps=554154, ups=1.12, wpb=494772, bsz=16958.6, num_updates=200, lr=5e-05, gnorm=1.86, clip=94, loss_scale=4, train_wall=89, gb_free=21.1, wall=202 epoch 001: 301 / 1689 loss=9.886, nll_loss=9.221, ppl=596.67, wps=554872, ups=1.12, wpb=496328, bsz=16644.9, num_updates=300, lr=7.5e-05, gnorm=2.054, clip=100, loss_scale=4, train_wall=89, gb_free=21.7, wall=291 epoch 001: 401 / 1689 loss=9.28, nll_loss=8.505, ppl=363.23, wps=549218, ups=1.11, wpb=495021, bsz=16565.7, num_updates=400, lr=0.0001, gnorm=1.831, clip=100, loss_scale=4, train_wall=90, gb_free=21.7, wall=381 epoch 001: 501 / 1689 loss=8.833, nll_loss=7.975, ppl=251.64, wps=548663, ups=1.11, wpb=495038, bsz=16610.6, num_updates=500, lr=0.000125, gnorm=1.659, clip=99, loss_scale=4, train_wall=90, gb_free=21.5, wall=472 epoch 001: 601 / 1689 loss=8.465, nll_loss=7.542, ppl=186.34, wps=545926, ups=1.1, wpb=495492, bsz=16496.7, num_updates=600, lr=0.00015, gnorm=1.525, clip=100, loss_scale=8, train_wall=90, gb_free=17.5, wall=562 epoch 001: 702 / 1689 loss=8.109, nll_loss=7.126, ppl=139.69, wps=540666, ups=1.09, wpb=494758, bsz=16304.4, num_updates=700, lr=0.000175, gnorm=1.395, clip=100, loss_scale=4, train_wall=90, gb_free=21.5, wall=654 epoch 001: 802 / 1689 loss=7.743, nll_loss=6.702, ppl=104.12, wps=545464, ups=1.1, wpb=496202, bsz=16400.6, num_updates=800, lr=0.0002, gnorm=1.291, clip=95, loss_scale=4, train_wall=90, gb_free=21.2, wall=745 epoch 001: 902 / 1689 loss=7.392, nll_loss=6.295, ppl=78.52, wps=551664, ups=1.11, wpb=496562, bsz=16617.4, num_updates=900, lr=0.000225, gnorm=1.164, clip=79, loss_scale=4, train_wall=89, gb_free=22, wall=835 epoch 001: 1002 / 1689 loss=7.087, nll_loss=5.942, ppl=61.49, wps=548932, ups=1.11, wpb=496189, bsz=16704.8, num_updates=1000, lr=0.00025, gnorm=1.085, clip=64, loss_scale=4, train_wall=90, gb_free=21.4, wall=925 begin validation on "valid" subset epoch 001 | valid on 'valid' subset | loss 6.852 | nll_loss 5.618 | ppl 49.12 | wps 0 | wpb 44526 | bsz 2008 | num_updates 1000 epoch 001: 1102 / 1689 loss=6.758, nll_loss=5.564, ppl=47.32, wps=464109, ups=0.94, wpb=494438, bsz=16503.4, num_updates=1100, lr=0.000275, gnorm=1.007, clip=50, loss_scale=4, train_wall=89, gb_free=19.6, wall=1032 epoch 001: 1202 / 1689 loss=6.443, nll_loss=5.202, ppl=36.8, wps=556724, ups=1.13, wpb=494853, bsz=16237.2, num_updates=1200, lr=0.0003, gnorm=0.96, clip=43, loss_scale=8, train_wall=88, gb_free=21.9, wall=1121 epoch 001: 1302 / 1689 loss=6.13, nll_loss=4.845, ppl=28.73, wps=549620, ups=1.11, wpb=493588, bsz=16373.8, num_updates=1300, lr=0.000325, gnorm=0.94, clip=34, loss_scale=8, train_wall=89, gb_free=21, wall=1211 epoch 001: 1402 / 1689 loss=5.828, nll_loss=4.5, ppl=22.63, wps=549992, ups=1.11, wpb=497384, bsz=16279.9, num_updates=1400, lr=0.00035, gnorm=0.83, clip=21, loss_scale=8, train_wall=89, gb_free=22.2, wall=1301 epoch 001: 1502 / 1689 loss=5.589, nll_loss=4.23, ppl=18.76, wps=544282, ups=1.1, wpb=494548, bsz=16575.5, num_updates=1500, lr=0.000375, gnorm=0.771, clip=10, loss_scale=8, train_wall=90, gb_free=21.9, wall=1392 epoch 001: 1602 / 1689 loss=5.392, nll_loss=4.008, ppl=16.09, wps=547047, ups=1.1, wpb=495810, bsz=16558.1, num_updates=1600, lr=0.0004, gnorm=0.711, clip=10, loss_scale=8, train_wall=89, gb_free=21.6, wall=1483 end of epoch 1 (average epoch stats below) epoch 001 | loss 7.72 | nll_loss 6.692 | ppl 103.42 | wps 542820 | ups 1.1 | wpb 495106 | bsz 16502.4 | num_updates 1686 | lr 0.0004215 | gnorm 1.314 | clip 64 | loss_scale 8 | train_wall 1512 | gb_free 20.9 | wall 1560 Start iterating over samples epoch 002: 14 / 1689 loss=5.254, nll_loss=3.855, ppl=14.47, wps=535486, ups=1.09, wpb=490634, bsz=16234.5, num_updates=1700, lr=0.000425, gnorm=0.646, clip=3, loss_scale=8, train_wall=90, gb_free=21.6, wall=1574 epoch 002: 14 / 1689 loss=5.254, nll_loss=3.855, ppl=14.47, wps=535486, ups=1.09, wpb=490634, bsz=16234.5, num_updates=1700, lr=0.000425, gnorm=0.646, clip=3, loss_scale=8, train_wall=90, gb_free=21.6, wall=1574 epoch 002: 114 / 1689 loss=5.114, nll_loss=3.699, ppl=12.98, wps=549896, ups=1.11, wpb=496688, bsz=16760.9, num_updates=1800, lr=0.00045, gnorm=0.594, clip=2, loss_scale=8, train_wall=89, gb_free=21.5, wall=1664 epoch 002: 114 / 1689 loss=5.114, nll_loss=3.699, ppl=12.98, wps=549896, ups=1.11, wpb=496688, bsz=16760.9, num_updates=1800, lr=0.00045, gnorm=0.594, clip=2, loss_scale=8, train_wall=89, gb_free=21.5, wall=1664 epoch 002: 215 / 1689 loss=5.021, nll_loss=3.596, ppl=12.09, wps=541549, ups=1.09, wpb=494699, bsz=16515.4, num_updates=1900, lr=0.000475, gnorm=0.575, clip=2, loss_scale=4, train_wall=90, gb_free=22.3, wall=1756 epoch 002: 215 / 1689 loss=5.021, nll_loss=3.596, ppl=12.09, wps=541549, ups=1.09, wpb=494699, bsz=16515.4, num_updates=1900, lr=0.000475, gnorm=0.575, clip=2, loss_scale=4, train_wall=90, gb_free=22.3, wall=1756 epoch 002: 315 / 1689 loss=4.922, nll_loss=3.488, ppl=11.22, wps=544968, ups=1.1, wpb=494152, bsz=16807.8, num_updates=2000, lr=0.0005, gnorm=0.519, clip=2, loss_scale=4, train_wall=89, gb_free=21.7, wall=1846 epoch 002: 315 / 1689 loss=4.922, nll_loss=3.488, ppl=11.22, wps=544968, ups=1.1, wpb=494152, bsz=16807.8, num_updates=2000, lr=0.0005, gnorm=0.519, clip=2, loss_scale=4, train_wall=89, gb_free=21.7, wall=1846 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 4.885 | nll_loss 3.373 | ppl 10.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 2000 | best_loss 4.885 epoch 002 | valid on 'valid' subset | loss 4.885 | nll_loss 3.373 | ppl 10.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 2000 | best_loss 4.885 epoch 002: 415 / 1689 loss=4.834, nll_loss=3.391, ppl=10.49, wps=457158, ups=0.92, wpb=495100, bsz=16294.4, num_updates=2100, lr=0.000525, gnorm=0.522, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=1955 epoch 002: 415 / 1689 loss=4.834, nll_loss=3.391, ppl=10.49, wps=457158, ups=0.92, wpb=495100, bsz=16294.4, num_updates=2100, lr=0.000525, gnorm=0.522, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=1955 epoch 002: 515 / 1689 loss=4.764, nll_loss=3.314, ppl=9.94, wps=545696, ups=1.1, wpb=494927, bsz=16567.2, num_updates=2200, lr=0.00055, gnorm=0.475, clip=1, loss_scale=4, train_wall=89, gb_free=21.9, wall=2045 epoch 002: 515 / 1689 loss=4.764, nll_loss=3.314, ppl=9.94, wps=545696, ups=1.1, wpb=494927, bsz=16567.2, num_updates=2200, lr=0.00055, gnorm=0.475, clip=1, loss_scale=4, train_wall=89, gb_free=21.9, wall=2045 epoch 002: 615 / 1689 loss=4.684, nll_loss=3.227, ppl=9.37, wps=548868, ups=1.11, wpb=495521, bsz=16630.4, num_updates=2300, lr=0.000575, gnorm=0.447, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=2136 epoch 002: 615 / 1689 loss=4.684, nll_loss=3.227, ppl=9.37, wps=548868, ups=1.11, wpb=495521, bsz=16630.4, num_updates=2300, lr=0.000575, gnorm=0.447, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=2136 epoch 002: 716 / 1689 loss=4.653, nll_loss=3.194, ppl=9.15, wps=538408, ups=1.09, wpb=495283, bsz=16119.6, num_updates=2400, lr=0.0006, gnorm=0.455, clip=6, loss_scale=4, train_wall=90, gb_free=21.6, wall=2228 epoch 002: 716 / 1689 loss=4.653, nll_loss=3.194, ppl=9.15, wps=538408, ups=1.09, wpb=495283, bsz=16119.6, num_updates=2400, lr=0.0006, gnorm=0.455, clip=6, loss_scale=4, train_wall=90, gb_free=21.6, wall=2228 epoch 002: 816 / 1689 loss=4.583, nll_loss=3.118, ppl=8.68, wps=542715, ups=1.1, wpb=494615, bsz=16786.2, num_updates=2500, lr=0.000625, gnorm=0.452, clip=0, loss_scale=4, train_wall=89, gb_free=20.6, wall=2319 epoch 002: 816 / 1689 loss=4.583, nll_loss=3.118, ppl=8.68, wps=542715, ups=1.1, wpb=494615, bsz=16786.2, num_updates=2500, lr=0.000625, gnorm=0.452, clip=0, loss_scale=4, train_wall=89, gb_free=20.6, wall=2319 epoch 002: 916 / 1689 loss=4.524, nll_loss=3.054, ppl=8.3, wps=547411, ups=1.1, wpb=495471, bsz=16597.7, num_updates=2600, lr=0.00065, gnorm=0.402, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=2409 epoch 002: 916 / 1689 loss=4.524, nll_loss=3.054, ppl=8.3, wps=547411, ups=1.1, wpb=495471, bsz=16597.7, num_updates=2600, lr=0.00065, gnorm=0.402, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=2409 epoch 002: 1016 / 1689 loss=4.493, nll_loss=3.02, ppl=8.11, wps=547634, ups=1.11, wpb=495174, bsz=16381.9, num_updates=2700, lr=0.000675, gnorm=0.43, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=2500 epoch 002: 1016 / 1689 loss=4.493, nll_loss=3.02, ppl=8.11, wps=547634, ups=1.11, wpb=495174, bsz=16381.9, num_updates=2700, lr=0.000675, gnorm=0.43, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=2500 epoch 002: 1116 / 1689 loss=4.457, nll_loss=2.98, ppl=7.89, wps=547046, ups=1.11, wpb=494962, bsz=16373.1, num_updates=2800, lr=0.0007, gnorm=0.39, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=2590 epoch 002: 1116 / 1689 loss=4.457, nll_loss=2.98, ppl=7.89, wps=547046, ups=1.11, wpb=494962, bsz=16373.1, num_updates=2800, lr=0.0007, gnorm=0.39, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=2590 epoch 002: 1217 / 1689 loss=4.413, nll_loss=2.934, ppl=7.64, wps=540919, ups=1.09, wpb=496138, bsz=16444.8, num_updates=2900, lr=0.000725, gnorm=0.382, clip=0, loss_scale=4, train_wall=90, gb_free=21.9, wall=2682 epoch 002: 1217 / 1689 loss=4.413, nll_loss=2.934, ppl=7.64, wps=540919, ups=1.09, wpb=496138, bsz=16444.8, num_updates=2900, lr=0.000725, gnorm=0.382, clip=0, loss_scale=4, train_wall=90, gb_free=21.9, wall=2682 epoch 002: 1317 / 1689 loss=4.389, nll_loss=2.908, ppl=7.51, wps=547325, ups=1.1, wpb=496192, bsz=16657.4, num_updates=3000, lr=0.00075, gnorm=0.395, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=2773 epoch 002: 1317 / 1689 loss=4.389, nll_loss=2.908, ppl=7.51, wps=547325, ups=1.1, wpb=496192, bsz=16657.4, num_updates=3000, lr=0.00075, gnorm=0.395, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=2773 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 4.349 | nll_loss 2.78 | ppl 6.87 | wps 0 | wpb 44526 | bsz 2008 | num_updates 3000 | best_loss 4.349 epoch 002 | valid on 'valid' subset | loss 4.349 | nll_loss 2.78 | ppl 6.87 | wps 0 | wpb 44526 | bsz 2008 | num_updates 3000 | best_loss 4.349 epoch 002: 1417 / 1689 loss=4.355, nll_loss=2.871, ppl=7.32, wps=449587, ups=0.91, wpb=494634, bsz=16333.1, num_updates=3100, lr=0.000775, gnorm=0.379, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=2883 epoch 002: 1417 / 1689 loss=4.355, nll_loss=2.871, ppl=7.32, wps=449587, ups=0.91, wpb=494634, bsz=16333.1, num_updates=3100, lr=0.000775, gnorm=0.379, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=2883 epoch 002: 1517 / 1689 loss=4.332, nll_loss=2.847, ppl=7.19, wps=549781, ups=1.11, wpb=495455, bsz=16419.5, num_updates=3200, lr=0.0008, gnorm=0.381, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=2973 epoch 002: 1517 / 1689 loss=4.332, nll_loss=2.847, ppl=7.19, wps=549781, ups=1.11, wpb=495455, bsz=16419.5, num_updates=3200, lr=0.0008, gnorm=0.381, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=2973 epoch 002: 1617 / 1689 loss=4.309, nll_loss=2.822, ppl=7.07, wps=549704, ups=1.11, wpb=497280, bsz=16313, num_updates=3300, lr=0.000825, gnorm=0.376, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=3063 epoch 002: 1617 / 1689 loss=4.309, nll_loss=2.822, ppl=7.07, wps=549704, ups=1.11, wpb=497280, bsz=16313, num_updates=3300, lr=0.000825, gnorm=0.376, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=3063 end of epoch 2 (average epoch stats below) epoch 002 | loss 4.606 | nll_loss 3.144 | ppl 8.84 | wps 532442 | ups 1.08 | wpb 495120 | bsz 16502.6 | num_updates 3372 | lr 0.000843 | gnorm 0.446 | clip 0.8 | loss_scale 4 | train_wall 1501 | gb_free 25.6 | wall 3128 epoch 002 | loss 4.606 | nll_loss 3.144 | ppl 8.84 | wps 532442 | ups 1.08 | wpb 495120 | bsz 16502.6 | num_updates 3372 | lr 0.000843 | gnorm 0.446 | clip 0.8 | loss_scale 4 | train_wall 1501 | gb_free 25.6 | wall 3128 Start iterating over samples epoch 003: 28 / 1689 loss=4.298, nll_loss=2.81, ppl=7.01, wps=538730, ups=1.1, wpb=490460, bsz=16511.3, num_updates=3400, lr=0.00085, gnorm=0.389, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=3154 epoch 003: 28 / 1689 loss=4.298, nll_loss=2.81, ppl=7.01, wps=538730, ups=1.1, wpb=490460, bsz=16511.3, num_updates=3400, lr=0.00085, gnorm=0.389, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=3154 epoch 003: 28 / 1689 loss=4.298, nll_loss=2.81, ppl=7.01, wps=538730, ups=1.1, wpb=490460, bsz=16511.3, num_updates=3400, lr=0.00085, gnorm=0.389, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=3154 epoch 003: 129 / 1689 loss=4.255, nll_loss=2.763, ppl=6.79, wps=540394, ups=1.09, wpb=495005, bsz=16802.5, num_updates=3500, lr=0.000875, gnorm=0.345, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=3246 epoch 003: 129 / 1689 loss=4.255, nll_loss=2.763, ppl=6.79, wps=540394, ups=1.09, wpb=495005, bsz=16802.5, num_updates=3500, lr=0.000875, gnorm=0.345, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=3246 epoch 003: 129 / 1689 loss=4.255, nll_loss=2.763, ppl=6.79, wps=540394, ups=1.09, wpb=495005, bsz=16802.5, num_updates=3500, lr=0.000875, gnorm=0.345, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=3246 epoch 003: 229 / 1689 loss=4.249, nll_loss=2.757, ppl=6.76, wps=550454, ups=1.11, wpb=495540, bsz=16848.8, num_updates=3600, lr=0.0009, gnorm=0.368, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=3336 epoch 003: 229 / 1689 loss=4.249, nll_loss=2.757, ppl=6.76, wps=550454, ups=1.11, wpb=495540, bsz=16848.8, num_updates=3600, lr=0.0009, gnorm=0.368, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=3336 epoch 003: 229 / 1689 loss=4.249, nll_loss=2.757, ppl=6.76, wps=550454, ups=1.11, wpb=495540, bsz=16848.8, num_updates=3600, lr=0.0009, gnorm=0.368, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=3336 epoch 003: 329 / 1689 loss=4.238, nll_loss=2.746, ppl=6.71, wps=547664, ups=1.11, wpb=494780, bsz=16521.4, num_updates=3700, lr=0.000925, gnorm=0.365, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=3426 epoch 003: 329 / 1689 loss=4.238, nll_loss=2.746, ppl=6.71, wps=547664, ups=1.11, wpb=494780, bsz=16521.4, num_updates=3700, lr=0.000925, gnorm=0.365, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=3426 epoch 003: 329 / 1689 loss=4.238, nll_loss=2.746, ppl=6.71, wps=547664, ups=1.11, wpb=494780, bsz=16521.4, num_updates=3700, lr=0.000925, gnorm=0.365, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=3426 epoch 003: 429 / 1689 loss=4.231, nll_loss=2.738, ppl=6.67, wps=549489, ups=1.11, wpb=494717, bsz=16453.2, num_updates=3800, lr=0.00095, gnorm=0.373, clip=0, loss_scale=4, train_wall=88, gb_free=20.3, wall=3516 epoch 003: 429 / 1689 loss=4.231, nll_loss=2.738, ppl=6.67, wps=549489, ups=1.11, wpb=494717, bsz=16453.2, num_updates=3800, lr=0.00095, gnorm=0.373, clip=0, loss_scale=4, train_wall=88, gb_free=20.3, wall=3516 epoch 003: 429 / 1689 loss=4.231, nll_loss=2.738, ppl=6.67, wps=549489, ups=1.11, wpb=494717, bsz=16453.2, num_updates=3800, lr=0.00095, gnorm=0.373, clip=0, loss_scale=4, train_wall=88, gb_free=20.3, wall=3516 epoch 003: 529 / 1689 loss=4.21, nll_loss=2.715, ppl=6.57, wps=553604, ups=1.12, wpb=496084, bsz=16501, num_updates=3900, lr=0.000975, gnorm=0.361, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=3606 epoch 003: 529 / 1689 loss=4.21, nll_loss=2.715, ppl=6.57, wps=553604, ups=1.12, wpb=496084, bsz=16501, num_updates=3900, lr=0.000975, gnorm=0.361, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=3606 epoch 003: 529 / 1689 loss=4.21, nll_loss=2.715, ppl=6.57, wps=553604, ups=1.12, wpb=496084, bsz=16501, num_updates=3900, lr=0.000975, gnorm=0.361, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=3606 epoch 003: 630 / 1689 loss=4.198, nll_loss=2.702, ppl=6.51, wps=546266, ups=1.1, wpb=495719, bsz=16476.2, num_updates=4000, lr=0.001, gnorm=0.348, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=3697 epoch 003: 630 / 1689 loss=4.198, nll_loss=2.702, ppl=6.51, wps=546266, ups=1.1, wpb=495719, bsz=16476.2, num_updates=4000, lr=0.001, gnorm=0.348, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=3697 epoch 003: 630 / 1689 loss=4.198, nll_loss=2.702, ppl=6.51, wps=546266, ups=1.1, wpb=495719, bsz=16476.2, num_updates=4000, lr=0.001, gnorm=0.348, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=3697 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.197 | nll_loss 2.642 | ppl 6.24 | wps 0 | wpb 44526 | bsz 2008 | num_updates 4000 | best_loss 4.197 epoch 003 | valid on 'valid' subset | loss 4.197 | nll_loss 2.642 | ppl 6.24 | wps 0 | wpb 44526 | bsz 2008 | num_updates 4000 | best_loss 4.197 epoch 003 | valid on 'valid' subset | loss 4.197 | nll_loss 2.642 | ppl 6.24 | wps 0 | wpb 44526 | bsz 2008 | num_updates 4000 | best_loss 4.197 epoch 003: 730 / 1689 loss=4.195, nll_loss=2.7, ppl=6.5, wps=455178, ups=0.92, wpb=496107, bsz=16321.5, num_updates=4100, lr=0.00098773, gnorm=0.359, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=3806 epoch 003: 730 / 1689 loss=4.195, nll_loss=2.7, ppl=6.5, wps=455178, ups=0.92, wpb=496107, bsz=16321.5, num_updates=4100, lr=0.00098773, gnorm=0.359, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=3806 epoch 003: 730 / 1689 loss=4.195, nll_loss=2.7, ppl=6.5, wps=455178, ups=0.92, wpb=496107, bsz=16321.5, num_updates=4100, lr=0.00098773, gnorm=0.359, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=3806 epoch 003: 830 / 1689 loss=4.179, nll_loss=2.683, ppl=6.42, wps=551797, ups=1.12, wpb=493256, bsz=16294.1, num_updates=4200, lr=0.0009759, gnorm=0.344, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=3895 epoch 003: 830 / 1689 loss=4.179, nll_loss=2.683, ppl=6.42, wps=551797, ups=1.12, wpb=493256, bsz=16294.1, num_updates=4200, lr=0.0009759, gnorm=0.344, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=3895 epoch 003: 830 / 1689 loss=4.179, nll_loss=2.683, ppl=6.42, wps=551797, ups=1.12, wpb=493256, bsz=16294.1, num_updates=4200, lr=0.0009759, gnorm=0.344, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=3895 epoch 003: 930 / 1689 loss=4.167, nll_loss=2.67, ppl=6.36, wps=553389, ups=1.12, wpb=494080, bsz=16401.6, num_updates=4300, lr=0.000964486, gnorm=0.335, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=3984 epoch 003: 930 / 1689 loss=4.167, nll_loss=2.67, ppl=6.36, wps=553389, ups=1.12, wpb=494080, bsz=16401.6, num_updates=4300, lr=0.000964486, gnorm=0.335, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=3984 epoch 003: 930 / 1689 loss=4.167, nll_loss=2.67, ppl=6.36, wps=553389, ups=1.12, wpb=494080, bsz=16401.6, num_updates=4300, lr=0.000964486, gnorm=0.335, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=3984 epoch 003: 1030 / 1689 loss=4.158, nll_loss=2.66, ppl=6.32, wps=552864, ups=1.11, wpb=496673, bsz=16402.4, num_updates=4400, lr=0.000953463, gnorm=0.326, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=4074 epoch 003: 1030 / 1689 loss=4.158, nll_loss=2.66, ppl=6.32, wps=552864, ups=1.11, wpb=496673, bsz=16402.4, num_updates=4400, lr=0.000953463, gnorm=0.326, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=4074 epoch 003: 1030 / 1689 loss=4.158, nll_loss=2.66, ppl=6.32, wps=552864, ups=1.11, wpb=496673, bsz=16402.4, num_updates=4400, lr=0.000953463, gnorm=0.326, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=4074 epoch 003: 1130 / 1689 loss=4.136, nll_loss=2.637, ppl=6.22, wps=552010, ups=1.11, wpb=496084, bsz=16964.2, num_updates=4500, lr=0.000942809, gnorm=0.331, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=4164 epoch 003: 1130 / 1689 loss=4.136, nll_loss=2.637, ppl=6.22, wps=552010, ups=1.11, wpb=496084, bsz=16964.2, num_updates=4500, lr=0.000942809, gnorm=0.331, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=4164 epoch 003: 1130 / 1689 loss=4.136, nll_loss=2.637, ppl=6.22, wps=552010, ups=1.11, wpb=496084, bsz=16964.2, num_updates=4500, lr=0.000942809, gnorm=0.331, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=4164 epoch 003: 1231 / 1689 loss=4.129, nll_loss=2.63, ppl=6.19, wps=543932, ups=1.1, wpb=494064, bsz=16407.5, num_updates=4600, lr=0.000932505, gnorm=0.321, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=4255 epoch 003: 1231 / 1689 loss=4.129, nll_loss=2.63, ppl=6.19, wps=543932, ups=1.1, wpb=494064, bsz=16407.5, num_updates=4600, lr=0.000932505, gnorm=0.321, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=4255 epoch 003: 1231 / 1689 loss=4.129, nll_loss=2.63, ppl=6.19, wps=543932, ups=1.1, wpb=494064, bsz=16407.5, num_updates=4600, lr=0.000932505, gnorm=0.321, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=4255 epoch 003: 1331 / 1689 loss=4.119, nll_loss=2.619, ppl=6.14, wps=552175, ups=1.11, wpb=495774, bsz=16363.3, num_updates=4700, lr=0.000922531, gnorm=0.32, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=4345 epoch 003: 1331 / 1689 loss=4.119, nll_loss=2.619, ppl=6.14, wps=552175, ups=1.11, wpb=495774, bsz=16363.3, num_updates=4700, lr=0.000922531, gnorm=0.32, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=4345 epoch 003: 1331 / 1689 loss=4.119, nll_loss=2.619, ppl=6.14, wps=552175, ups=1.11, wpb=495774, bsz=16363.3, num_updates=4700, lr=0.000922531, gnorm=0.32, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=4345 epoch 003: 1431 / 1689 loss=4.105, nll_loss=2.603, ppl=6.08, wps=549550, ups=1.11, wpb=496353, bsz=16691.4, num_updates=4800, lr=0.000912871, gnorm=0.307, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=4435 epoch 003: 1431 / 1689 loss=4.105, nll_loss=2.603, ppl=6.08, wps=549550, ups=1.11, wpb=496353, bsz=16691.4, num_updates=4800, lr=0.000912871, gnorm=0.307, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=4435 epoch 003: 1431 / 1689 loss=4.105, nll_loss=2.603, ppl=6.08, wps=549550, ups=1.11, wpb=496353, bsz=16691.4, num_updates=4800, lr=0.000912871, gnorm=0.307, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=4435 epoch 003: 1531 / 1689 loss=4.093, nll_loss=2.591, ppl=6.03, wps=551068, ups=1.11, wpb=495920, bsz=16447.6, num_updates=4900, lr=0.000903508, gnorm=0.31, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=4525 epoch 003: 1531 / 1689 loss=4.093, nll_loss=2.591, ppl=6.03, wps=551068, ups=1.11, wpb=495920, bsz=16447.6, num_updates=4900, lr=0.000903508, gnorm=0.31, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=4525 epoch 003: 1531 / 1689 loss=4.093, nll_loss=2.591, ppl=6.03, wps=551068, ups=1.11, wpb=495920, bsz=16447.6, num_updates=4900, lr=0.000903508, gnorm=0.31, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=4525 epoch 003: 1631 / 1689 loss=4.09, nll_loss=2.587, ppl=6.01, wps=552922, ups=1.12, wpb=495532, bsz=16098.1, num_updates=5000, lr=0.000894427, gnorm=0.301, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=4615 epoch 003: 1631 / 1689 loss=4.09, nll_loss=2.587, ppl=6.01, wps=552922, ups=1.12, wpb=495532, bsz=16098.1, num_updates=5000, lr=0.000894427, gnorm=0.301, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=4615 epoch 003: 1631 / 1689 loss=4.09, nll_loss=2.587, ppl=6.01, wps=552922, ups=1.12, wpb=495532, bsz=16098.1, num_updates=5000, lr=0.000894427, gnorm=0.301, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=4615 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.053 | nll_loss 2.502 | ppl 5.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 5000 | best_loss 4.053 epoch 003 | valid on 'valid' subset | loss 4.053 | nll_loss 2.502 | ppl 5.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 5000 | best_loss 4.053 epoch 003 | valid on 'valid' subset | loss 4.053 | nll_loss 2.502 | ppl 5.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 5000 | best_loss 4.053 end of epoch 3 (average epoch stats below) epoch 003 | loss 4.171 | nll_loss 2.674 | ppl 6.38 | wps 532554 | ups 1.08 | wpb 495118 | bsz 16503.3 | num_updates 5058 | lr 0.000889284 | gnorm 0.339 | clip 0 | loss_scale 4 | train_wall 1491 | gb_free 23.2 | wall 4696 epoch 003 | loss 4.171 | nll_loss 2.674 | ppl 6.38 | wps 532554 | ups 1.08 | wpb 495118 | bsz 16503.3 | num_updates 5058 | lr 0.000889284 | gnorm 0.339 | clip 0 | loss_scale 4 | train_wall 1491 | gb_free 23.2 | wall 4696 epoch 003 | loss 4.171 | nll_loss 2.674 | ppl 6.38 | wps 532554 | ups 1.08 | wpb 495118 | bsz 16503.3 | num_updates 5058 | lr 0.000889284 | gnorm 0.339 | clip 0 | loss_scale 4 | train_wall 1491 | gb_free 23.2 | wall 4696 Start iterating over samples epoch 004: 42 / 1689 loss=4.066, nll_loss=2.561, ppl=5.9, wps=410572, ups=0.84, wpb=491659, bsz=16823.7, num_updates=5100, lr=0.000885615, gnorm=0.302, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=4734 epoch 004: 42 / 1689 loss=4.066, nll_loss=2.561, ppl=5.9, wps=410572, ups=0.84, wpb=491659, bsz=16823.7, num_updates=5100, lr=0.000885615, gnorm=0.302, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=4734 epoch 004: 42 / 1689 loss=4.066, nll_loss=2.561, ppl=5.9, wps=410572, ups=0.84, wpb=491659, bsz=16823.7, num_updates=5100, lr=0.000885615, gnorm=0.302, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=4734 epoch 004: 42 / 1689 loss=4.066, nll_loss=2.561, ppl=5.9, wps=410572, ups=0.84, wpb=491659, bsz=16823.7, num_updates=5100, lr=0.000885615, gnorm=0.302, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=4734 epoch 004: 143 / 1689 loss=4.055, nll_loss=2.548, ppl=5.85, wps=556544, ups=1.12, wpb=495968, bsz=16362.6, num_updates=5200, lr=0.000877058, gnorm=0.294, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=4824 epoch 004: 143 / 1689 loss=4.055, nll_loss=2.548, ppl=5.85, wps=556544, ups=1.12, wpb=495968, bsz=16362.6, num_updates=5200, lr=0.000877058, gnorm=0.294, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=4824 epoch 004: 143 / 1689 loss=4.055, nll_loss=2.548, ppl=5.85, wps=556544, ups=1.12, wpb=495968, bsz=16362.6, num_updates=5200, lr=0.000877058, gnorm=0.294, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=4824 epoch 004: 143 / 1689 loss=4.055, nll_loss=2.548, ppl=5.85, wps=556544, ups=1.12, wpb=495968, bsz=16362.6, num_updates=5200, lr=0.000877058, gnorm=0.294, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=4824 epoch 004: 244 / 1689 loss=4.045, nll_loss=2.538, ppl=5.81, wps=544107, ups=1.1, wpb=496239, bsz=16376.7, num_updates=5300, lr=0.000868744, gnorm=0.292, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=4915 epoch 004: 244 / 1689 loss=4.045, nll_loss=2.538, ppl=5.81, wps=544107, ups=1.1, wpb=496239, bsz=16376.7, num_updates=5300, lr=0.000868744, gnorm=0.292, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=4915 epoch 004: 244 / 1689 loss=4.045, nll_loss=2.538, ppl=5.81, wps=544107, ups=1.1, wpb=496239, bsz=16376.7, num_updates=5300, lr=0.000868744, gnorm=0.292, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=4915 epoch 004: 244 / 1689 loss=4.045, nll_loss=2.538, ppl=5.81, wps=544107, ups=1.1, wpb=496239, bsz=16376.7, num_updates=5300, lr=0.000868744, gnorm=0.292, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=4915 epoch 004: 344 / 1689 loss=4.039, nll_loss=2.532, ppl=5.78, wps=549011, ups=1.11, wpb=493916, bsz=16712.4, num_updates=5400, lr=0.000860663, gnorm=0.301, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=5005 epoch 004: 344 / 1689 loss=4.039, nll_loss=2.532, ppl=5.78, wps=549011, ups=1.11, wpb=493916, bsz=16712.4, num_updates=5400, lr=0.000860663, gnorm=0.301, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=5005 epoch 004: 344 / 1689 loss=4.039, nll_loss=2.532, ppl=5.78, wps=549011, ups=1.11, wpb=493916, bsz=16712.4, num_updates=5400, lr=0.000860663, gnorm=0.301, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=5005 epoch 004: 344 / 1689 loss=4.039, nll_loss=2.532, ppl=5.78, wps=549011, ups=1.11, wpb=493916, bsz=16712.4, num_updates=5400, lr=0.000860663, gnorm=0.301, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=5005 epoch 004: 444 / 1689 loss=4.026, nll_loss=2.518, ppl=5.73, wps=555631, ups=1.12, wpb=497168, bsz=16659.1, num_updates=5500, lr=0.000852803, gnorm=0.289, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=5094 epoch 004: 444 / 1689 loss=4.026, nll_loss=2.518, ppl=5.73, wps=555631, ups=1.12, wpb=497168, bsz=16659.1, num_updates=5500, lr=0.000852803, gnorm=0.289, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=5094 epoch 004: 444 / 1689 loss=4.026, nll_loss=2.518, ppl=5.73, wps=555631, ups=1.12, wpb=497168, bsz=16659.1, num_updates=5500, lr=0.000852803, gnorm=0.289, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=5094 epoch 004: 444 / 1689 loss=4.026, nll_loss=2.518, ppl=5.73, wps=555631, ups=1.12, wpb=497168, bsz=16659.1, num_updates=5500, lr=0.000852803, gnorm=0.289, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=5094 epoch 004: 544 / 1689 loss=4.038, nll_loss=2.532, ppl=5.78, wps=554169, ups=1.12, wpb=495796, bsz=16480.4, num_updates=5600, lr=0.000845154, gnorm=0.281, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=5184 epoch 004: 544 / 1689 loss=4.038, nll_loss=2.532, ppl=5.78, wps=554169, ups=1.12, wpb=495796, bsz=16480.4, num_updates=5600, lr=0.000845154, gnorm=0.281, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=5184 epoch 004: 544 / 1689 loss=4.038, nll_loss=2.532, ppl=5.78, wps=554169, ups=1.12, wpb=495796, bsz=16480.4, num_updates=5600, lr=0.000845154, gnorm=0.281, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=5184 epoch 004: 544 / 1689 loss=4.038, nll_loss=2.532, ppl=5.78, wps=554169, ups=1.12, wpb=495796, bsz=16480.4, num_updates=5600, lr=0.000845154, gnorm=0.281, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=5184 epoch 004: 644 / 1689 loss=4.021, nll_loss=2.513, ppl=5.71, wps=548393, ups=1.11, wpb=494228, bsz=16562.7, num_updates=5700, lr=0.000837708, gnorm=0.292, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=5274 epoch 004: 644 / 1689 loss=4.021, nll_loss=2.513, ppl=5.71, wps=548393, ups=1.11, wpb=494228, bsz=16562.7, num_updates=5700, lr=0.000837708, gnorm=0.292, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=5274 epoch 004: 644 / 1689 loss=4.021, nll_loss=2.513, ppl=5.71, wps=548393, ups=1.11, wpb=494228, bsz=16562.7, num_updates=5700, lr=0.000837708, gnorm=0.292, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=5274 epoch 004: 644 / 1689 loss=4.021, nll_loss=2.513, ppl=5.71, wps=548393, ups=1.11, wpb=494228, bsz=16562.7, num_updates=5700, lr=0.000837708, gnorm=0.292, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=5274 epoch 004: 744 / 1689 loss=4.017, nll_loss=2.509, ppl=5.69, wps=549126, ups=1.11, wpb=495809, bsz=16491.7, num_updates=5800, lr=0.000830455, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=5364 epoch 004: 744 / 1689 loss=4.017, nll_loss=2.509, ppl=5.69, wps=549126, ups=1.11, wpb=495809, bsz=16491.7, num_updates=5800, lr=0.000830455, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=5364 epoch 004: 744 / 1689 loss=4.017, nll_loss=2.509, ppl=5.69, wps=549126, ups=1.11, wpb=495809, bsz=16491.7, num_updates=5800, lr=0.000830455, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=5364 epoch 004: 744 / 1689 loss=4.017, nll_loss=2.509, ppl=5.69, wps=549126, ups=1.11, wpb=495809, bsz=16491.7, num_updates=5800, lr=0.000830455, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=5364 epoch 004: 844 / 1689 loss=4.022, nll_loss=2.514, ppl=5.71, wps=545439, ups=1.1, wpb=495241, bsz=16259.4, num_updates=5900, lr=0.000823387, gnorm=0.278, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=5455 epoch 004: 844 / 1689 loss=4.022, nll_loss=2.514, ppl=5.71, wps=545439, ups=1.1, wpb=495241, bsz=16259.4, num_updates=5900, lr=0.000823387, gnorm=0.278, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=5455 epoch 004: 844 / 1689 loss=4.022, nll_loss=2.514, ppl=5.71, wps=545439, ups=1.1, wpb=495241, bsz=16259.4, num_updates=5900, lr=0.000823387, gnorm=0.278, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=5455 epoch 004: 844 / 1689 loss=4.022, nll_loss=2.514, ppl=5.71, wps=545439, ups=1.1, wpb=495241, bsz=16259.4, num_updates=5900, lr=0.000823387, gnorm=0.278, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=5455 epoch 004: 944 / 1689 loss=4.003, nll_loss=2.493, ppl=5.63, wps=553164, ups=1.12, wpb=495799, bsz=16359.4, num_updates=6000, lr=0.000816497, gnorm=0.281, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=5545 epoch 004: 944 / 1689 loss=4.003, nll_loss=2.493, ppl=5.63, wps=553164, ups=1.12, wpb=495799, bsz=16359.4, num_updates=6000, lr=0.000816497, gnorm=0.281, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=5545 epoch 004: 944 / 1689 loss=4.003, nll_loss=2.493, ppl=5.63, wps=553164, ups=1.12, wpb=495799, bsz=16359.4, num_updates=6000, lr=0.000816497, gnorm=0.281, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=5545 epoch 004: 944 / 1689 loss=4.003, nll_loss=2.493, ppl=5.63, wps=553164, ups=1.12, wpb=495799, bsz=16359.4, num_updates=6000, lr=0.000816497, gnorm=0.281, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=5545 begin validation on "valid" subset epoch 004 | valid on 'valid' subset | loss 3.984 | nll_loss 2.427 | ppl 5.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 3.984 epoch 004 | valid on 'valid' subset | loss 3.984 | nll_loss 2.427 | ppl 5.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 3.984 epoch 004 | valid on 'valid' subset | loss 3.984 | nll_loss 2.427 | ppl 5.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 3.984 epoch 004 | valid on 'valid' subset | loss 3.984 | nll_loss 2.427 | ppl 5.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 3.984 epoch 004: 1044 / 1689 loss=3.997, nll_loss=2.487, ppl=5.61, wps=454814, ups=0.92, wpb=495086, bsz=16673.1, num_updates=6100, lr=0.000809776, gnorm=0.278, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=5653 epoch 004: 1044 / 1689 loss=3.997, nll_loss=2.487, ppl=5.61, wps=454814, ups=0.92, wpb=495086, bsz=16673.1, num_updates=6100, lr=0.000809776, gnorm=0.278, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=5653 epoch 004: 1044 / 1689 loss=3.997, nll_loss=2.487, ppl=5.61, wps=454814, ups=0.92, wpb=495086, bsz=16673.1, num_updates=6100, lr=0.000809776, gnorm=0.278, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=5653 epoch 004: 1044 / 1689 loss=3.997, nll_loss=2.487, ppl=5.61, wps=454814, ups=0.92, wpb=495086, bsz=16673.1, num_updates=6100, lr=0.000809776, gnorm=0.278, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=5653 epoch 004: 1144 / 1689 loss=3.998, nll_loss=2.489, ppl=5.62, wps=551373, ups=1.11, wpb=494915, bsz=16669.1, num_updates=6200, lr=0.000803219, gnorm=0.266, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=5743 epoch 004: 1144 / 1689 loss=3.998, nll_loss=2.489, ppl=5.62, wps=551373, ups=1.11, wpb=494915, bsz=16669.1, num_updates=6200, lr=0.000803219, gnorm=0.266, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=5743 epoch 004: 1144 / 1689 loss=3.998, nll_loss=2.489, ppl=5.62, wps=551373, ups=1.11, wpb=494915, bsz=16669.1, num_updates=6200, lr=0.000803219, gnorm=0.266, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=5743 epoch 004: 1144 / 1689 loss=3.998, nll_loss=2.489, ppl=5.62, wps=551373, ups=1.11, wpb=494915, bsz=16669.1, num_updates=6200, lr=0.000803219, gnorm=0.266, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=5743 epoch 004: 1244 / 1689 loss=3.99, nll_loss=2.48, ppl=5.58, wps=547028, ups=1.1, wpb=495126, bsz=16272.9, num_updates=6300, lr=0.000796819, gnorm=0.272, clip=0, loss_scale=8, train_wall=88, gb_free=22.3, wall=5834 epoch 004: 1244 / 1689 loss=3.99, nll_loss=2.48, ppl=5.58, wps=547028, ups=1.1, wpb=495126, bsz=16272.9, num_updates=6300, lr=0.000796819, gnorm=0.272, clip=0, loss_scale=8, train_wall=88, gb_free=22.3, wall=5834 epoch 004: 1244 / 1689 loss=3.99, nll_loss=2.48, ppl=5.58, wps=547028, ups=1.1, wpb=495126, bsz=16272.9, num_updates=6300, lr=0.000796819, gnorm=0.272, clip=0, loss_scale=8, train_wall=88, gb_free=22.3, wall=5834 epoch 004: 1244 / 1689 loss=3.99, nll_loss=2.48, ppl=5.58, wps=547028, ups=1.1, wpb=495126, bsz=16272.9, num_updates=6300, lr=0.000796819, gnorm=0.272, clip=0, loss_scale=8, train_wall=88, gb_free=22.3, wall=5834 epoch 004: 1345 / 1689 loss=3.989, nll_loss=2.48, ppl=5.58, wps=544092, ups=1.1, wpb=494175, bsz=16742.5, num_updates=6400, lr=0.000790569, gnorm=0.274, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=5924 epoch 004: 1345 / 1689 loss=3.989, nll_loss=2.48, ppl=5.58, wps=544092, ups=1.1, wpb=494175, bsz=16742.5, num_updates=6400, lr=0.000790569, gnorm=0.274, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=5924 epoch 004: 1345 / 1689 loss=3.989, nll_loss=2.48, ppl=5.58, wps=544092, ups=1.1, wpb=494175, bsz=16742.5, num_updates=6400, lr=0.000790569, gnorm=0.274, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=5924 epoch 004: 1345 / 1689 loss=3.989, nll_loss=2.48, ppl=5.58, wps=544092, ups=1.1, wpb=494175, bsz=16742.5, num_updates=6400, lr=0.000790569, gnorm=0.274, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=5924 epoch 004: 1445 / 1689 loss=3.98, nll_loss=2.47, ppl=5.54, wps=551107, ups=1.11, wpb=494419, bsz=16771.3, num_updates=6500, lr=0.000784465, gnorm=0.283, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=6014 epoch 004: 1445 / 1689 loss=3.98, nll_loss=2.47, ppl=5.54, wps=551107, ups=1.11, wpb=494419, bsz=16771.3, num_updates=6500, lr=0.000784465, gnorm=0.283, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=6014 epoch 004: 1445 / 1689 loss=3.98, nll_loss=2.47, ppl=5.54, wps=551107, ups=1.11, wpb=494419, bsz=16771.3, num_updates=6500, lr=0.000784465, gnorm=0.283, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=6014 epoch 004: 1445 / 1689 loss=3.98, nll_loss=2.47, ppl=5.54, wps=551107, ups=1.11, wpb=494419, bsz=16771.3, num_updates=6500, lr=0.000784465, gnorm=0.283, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=6014 epoch 004: 1545 / 1689 loss=3.977, nll_loss=2.466, ppl=5.53, wps=547039, ups=1.11, wpb=494629, bsz=16409.5, num_updates=6600, lr=0.000778499, gnorm=0.273, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=6105 epoch 004: 1545 / 1689 loss=3.977, nll_loss=2.466, ppl=5.53, wps=547039, ups=1.11, wpb=494629, bsz=16409.5, num_updates=6600, lr=0.000778499, gnorm=0.273, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=6105 epoch 004: 1545 / 1689 loss=3.977, nll_loss=2.466, ppl=5.53, wps=547039, ups=1.11, wpb=494629, bsz=16409.5, num_updates=6600, lr=0.000778499, gnorm=0.273, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=6105 epoch 004: 1545 / 1689 loss=3.977, nll_loss=2.466, ppl=5.53, wps=547039, ups=1.11, wpb=494629, bsz=16409.5, num_updates=6600, lr=0.000778499, gnorm=0.273, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=6105 epoch 004: 1645 / 1689 loss=3.966, nll_loss=2.454, ppl=5.48, wps=549734, ups=1.11, wpb=497136, bsz=16175.6, num_updates=6700, lr=0.000772667, gnorm=0.26, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=6195 epoch 004: 1645 / 1689 loss=3.966, nll_loss=2.454, ppl=5.48, wps=549734, ups=1.11, wpb=497136, bsz=16175.6, num_updates=6700, lr=0.000772667, gnorm=0.26, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=6195 epoch 004: 1645 / 1689 loss=3.966, nll_loss=2.454, ppl=5.48, wps=549734, ups=1.11, wpb=497136, bsz=16175.6, num_updates=6700, lr=0.000772667, gnorm=0.26, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=6195 epoch 004: 1645 / 1689 loss=3.966, nll_loss=2.454, ppl=5.48, wps=549734, ups=1.11, wpb=497136, bsz=16175.6, num_updates=6700, lr=0.000772667, gnorm=0.26, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=6195 end of epoch 4 (average epoch stats below) epoch 004 | loss 4.01 | nll_loss 2.502 | ppl 5.66 | wps 542714 | ups 1.1 | wpb 495111 | bsz 16506.8 | num_updates 6744 | lr 0.000770143 | gnorm 0.281 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.3 | wall 6234 epoch 004 | loss 4.01 | nll_loss 2.502 | ppl 5.66 | wps 542714 | ups 1.1 | wpb 495111 | bsz 16506.8 | num_updates 6744 | lr 0.000770143 | gnorm 0.281 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.3 | wall 6234 epoch 004 | loss 4.01 | nll_loss 2.502 | ppl 5.66 | wps 542714 | ups 1.1 | wpb 495111 | bsz 16506.8 | num_updates 6744 | lr 0.000770143 | gnorm 0.281 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.3 | wall 6234 epoch 004 | loss 4.01 | nll_loss 2.502 | ppl 5.66 | wps 542714 | ups 1.1 | wpb 495111 | bsz 16506.8 | num_updates 6744 | lr 0.000770143 | gnorm 0.281 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.3 | wall 6234 Start iterating over samples epoch 005: 56 / 1689 loss=3.956, nll_loss=2.443, ppl=5.44, wps=545136, ups=1.11, wpb=490858, bsz=16249.8, num_updates=6800, lr=0.000766965, gnorm=0.268, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=6285 epoch 005: 56 / 1689 loss=3.956, nll_loss=2.443, ppl=5.44, wps=545136, ups=1.11, wpb=490858, bsz=16249.8, num_updates=6800, lr=0.000766965, gnorm=0.268, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=6285 epoch 005: 56 / 1689 loss=3.956, nll_loss=2.443, ppl=5.44, wps=545136, ups=1.11, wpb=490858, bsz=16249.8, num_updates=6800, lr=0.000766965, gnorm=0.268, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=6285 epoch 005: 56 / 1689 loss=3.956, nll_loss=2.443, ppl=5.44, wps=545136, ups=1.11, wpb=490858, bsz=16249.8, num_updates=6800, lr=0.000766965, gnorm=0.268, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=6285 epoch 005: 56 / 1689 loss=3.956, nll_loss=2.443, ppl=5.44, wps=545136, ups=1.11, wpb=490858, bsz=16249.8, num_updates=6800, lr=0.000766965, gnorm=0.268, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=6285 epoch 005: 158 / 1689 loss=3.942, nll_loss=2.427, ppl=5.38, wps=542206, ups=1.09, wpb=495179, bsz=16439.5, num_updates=6900, lr=0.000761387, gnorm=0.269, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=6376 epoch 005: 158 / 1689 loss=3.942, nll_loss=2.427, ppl=5.38, wps=542206, ups=1.09, wpb=495179, bsz=16439.5, num_updates=6900, lr=0.000761387, gnorm=0.269, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=6376 epoch 005: 158 / 1689 loss=3.942, nll_loss=2.427, ppl=5.38, wps=542206, ups=1.09, wpb=495179, bsz=16439.5, num_updates=6900, lr=0.000761387, gnorm=0.269, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=6376 epoch 005: 158 / 1689 loss=3.942, nll_loss=2.427, ppl=5.38, wps=542206, ups=1.09, wpb=495179, bsz=16439.5, num_updates=6900, lr=0.000761387, gnorm=0.269, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=6376 epoch 005: 158 / 1689 loss=3.942, nll_loss=2.427, ppl=5.38, wps=542206, ups=1.09, wpb=495179, bsz=16439.5, num_updates=6900, lr=0.000761387, gnorm=0.269, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=6376 epoch 005: 258 / 1689 loss=3.932, nll_loss=2.416, ppl=5.34, wps=548455, ups=1.11, wpb=494494, bsz=16414.6, num_updates=7000, lr=0.000755929, gnorm=0.266, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=6467 epoch 005: 258 / 1689 loss=3.932, nll_loss=2.416, ppl=5.34, wps=548455, ups=1.11, wpb=494494, bsz=16414.6, num_updates=7000, lr=0.000755929, gnorm=0.266, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=6467 epoch 005: 258 / 1689 loss=3.932, nll_loss=2.416, ppl=5.34, wps=548455, ups=1.11, wpb=494494, bsz=16414.6, num_updates=7000, lr=0.000755929, gnorm=0.266, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=6467 epoch 005: 258 / 1689 loss=3.932, nll_loss=2.416, ppl=5.34, wps=548455, ups=1.11, wpb=494494, bsz=16414.6, num_updates=7000, lr=0.000755929, gnorm=0.266, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=6467 epoch 005: 258 / 1689 loss=3.932, nll_loss=2.416, ppl=5.34, wps=548455, ups=1.11, wpb=494494, bsz=16414.6, num_updates=7000, lr=0.000755929, gnorm=0.266, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=6467 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 3.932 | nll_loss 2.372 | ppl 5.18 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.932 epoch 005 | valid on 'valid' subset | loss 3.932 | nll_loss 2.372 | ppl 5.18 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.932 epoch 005 | valid on 'valid' subset | loss 3.932 | nll_loss 2.372 | ppl 5.18 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.932 epoch 005 | valid on 'valid' subset | loss 3.932 | nll_loss 2.372 | ppl 5.18 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.932 epoch 005 | valid on 'valid' subset | loss 3.932 | nll_loss 2.372 | ppl 5.18 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.932 epoch 005: 358 / 1689 loss=3.942, nll_loss=2.427, ppl=5.38, wps=455484, ups=0.92, wpb=495153, bsz=16467.1, num_updates=7100, lr=0.000750587, gnorm=0.268, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=6575 epoch 005: 358 / 1689 loss=3.942, nll_loss=2.427, ppl=5.38, wps=455484, ups=0.92, wpb=495153, bsz=16467.1, num_updates=7100, lr=0.000750587, gnorm=0.268, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=6575 epoch 005: 358 / 1689 loss=3.942, nll_loss=2.427, ppl=5.38, wps=455484, ups=0.92, wpb=495153, bsz=16467.1, num_updates=7100, lr=0.000750587, gnorm=0.268, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=6575 epoch 005: 358 / 1689 loss=3.942, nll_loss=2.427, ppl=5.38, wps=455484, ups=0.92, wpb=495153, bsz=16467.1, num_updates=7100, lr=0.000750587, gnorm=0.268, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=6575 epoch 005: 358 / 1689 loss=3.942, nll_loss=2.427, ppl=5.38, wps=455484, ups=0.92, wpb=495153, bsz=16467.1, num_updates=7100, lr=0.000750587, gnorm=0.268, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=6575 epoch 005: 458 / 1689 loss=3.931, nll_loss=2.416, ppl=5.34, wps=552744, ups=1.11, wpb=496292, bsz=16675, num_updates=7200, lr=0.000745356, gnorm=0.253, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=6665 epoch 005: 458 / 1689 loss=3.931, nll_loss=2.416, ppl=5.34, wps=552744, ups=1.11, wpb=496292, bsz=16675, num_updates=7200, lr=0.000745356, gnorm=0.253, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=6665 epoch 005: 458 / 1689 loss=3.931, nll_loss=2.416, ppl=5.34, wps=552744, ups=1.11, wpb=496292, bsz=16675, num_updates=7200, lr=0.000745356, gnorm=0.253, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=6665 epoch 005: 458 / 1689 loss=3.931, nll_loss=2.416, ppl=5.34, wps=552744, ups=1.11, wpb=496292, bsz=16675, num_updates=7200, lr=0.000745356, gnorm=0.253, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=6665 epoch 005: 458 / 1689 loss=3.931, nll_loss=2.416, ppl=5.34, wps=552744, ups=1.11, wpb=496292, bsz=16675, num_updates=7200, lr=0.000745356, gnorm=0.253, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=6665 epoch 005: 558 / 1689 loss=3.927, nll_loss=2.412, ppl=5.32, wps=551139, ups=1.11, wpb=495647, bsz=16574.5, num_updates=7300, lr=0.000740233, gnorm=0.256, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=6755 epoch 005: 558 / 1689 loss=3.927, nll_loss=2.412, ppl=5.32, wps=551139, ups=1.11, wpb=495647, bsz=16574.5, num_updates=7300, lr=0.000740233, gnorm=0.256, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=6755 epoch 005: 558 / 1689 loss=3.927, nll_loss=2.412, ppl=5.32, wps=551139, ups=1.11, wpb=495647, bsz=16574.5, num_updates=7300, lr=0.000740233, gnorm=0.256, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=6755 epoch 005: 558 / 1689 loss=3.927, nll_loss=2.412, ppl=5.32, wps=551139, ups=1.11, wpb=495647, bsz=16574.5, num_updates=7300, lr=0.000740233, gnorm=0.256, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=6755 epoch 005: 558 / 1689 loss=3.927, nll_loss=2.412, ppl=5.32, wps=551139, ups=1.11, wpb=495647, bsz=16574.5, num_updates=7300, lr=0.000740233, gnorm=0.256, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=6755 epoch 005: 658 / 1689 loss=3.931, nll_loss=2.417, ppl=5.34, wps=549568, ups=1.11, wpb=495861, bsz=16415.4, num_updates=7400, lr=0.000735215, gnorm=0.262, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=6845 epoch 005: 658 / 1689 loss=3.931, nll_loss=2.417, ppl=5.34, wps=549568, ups=1.11, wpb=495861, bsz=16415.4, num_updates=7400, lr=0.000735215, gnorm=0.262, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=6845 epoch 005: 658 / 1689 loss=3.931, nll_loss=2.417, ppl=5.34, wps=549568, ups=1.11, wpb=495861, bsz=16415.4, num_updates=7400, lr=0.000735215, gnorm=0.262, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=6845 epoch 005: 658 / 1689 loss=3.931, nll_loss=2.417, ppl=5.34, wps=549568, ups=1.11, wpb=495861, bsz=16415.4, num_updates=7400, lr=0.000735215, gnorm=0.262, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=6845 epoch 005: 658 / 1689 loss=3.931, nll_loss=2.417, ppl=5.34, wps=549568, ups=1.11, wpb=495861, bsz=16415.4, num_updates=7400, lr=0.000735215, gnorm=0.262, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=6845 epoch 005: 758 / 1689 loss=3.933, nll_loss=2.419, ppl=5.35, wps=551738, ups=1.12, wpb=494330, bsz=16375.4, num_updates=7500, lr=0.000730297, gnorm=0.272, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6935 epoch 005: 758 / 1689 loss=3.933, nll_loss=2.419, ppl=5.35, wps=551738, ups=1.12, wpb=494330, bsz=16375.4, num_updates=7500, lr=0.000730297, gnorm=0.272, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6935 epoch 005: 758 / 1689 loss=3.933, nll_loss=2.419, ppl=5.35, wps=551738, ups=1.12, wpb=494330, bsz=16375.4, num_updates=7500, lr=0.000730297, gnorm=0.272, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6935 epoch 005: 758 / 1689 loss=3.933, nll_loss=2.419, ppl=5.35, wps=551738, ups=1.12, wpb=494330, bsz=16375.4, num_updates=7500, lr=0.000730297, gnorm=0.272, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6935 epoch 005: 758 / 1689 loss=3.933, nll_loss=2.419, ppl=5.35, wps=551738, ups=1.12, wpb=494330, bsz=16375.4, num_updates=7500, lr=0.000730297, gnorm=0.272, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6935 epoch 005: 858 / 1689 loss=3.927, nll_loss=2.412, ppl=5.32, wps=550180, ups=1.11, wpb=495347, bsz=16572.5, num_updates=7600, lr=0.000725476, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=7025 epoch 005: 858 / 1689 loss=3.927, nll_loss=2.412, ppl=5.32, wps=550180, ups=1.11, wpb=495347, bsz=16572.5, num_updates=7600, lr=0.000725476, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=7025 epoch 005: 858 / 1689 loss=3.927, nll_loss=2.412, ppl=5.32, wps=550180, ups=1.11, wpb=495347, bsz=16572.5, num_updates=7600, lr=0.000725476, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=7025 epoch 005: 858 / 1689 loss=3.927, nll_loss=2.412, ppl=5.32, wps=550180, ups=1.11, wpb=495347, bsz=16572.5, num_updates=7600, lr=0.000725476, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=7025 epoch 005: 858 / 1689 loss=3.927, nll_loss=2.412, ppl=5.32, wps=550180, ups=1.11, wpb=495347, bsz=16572.5, num_updates=7600, lr=0.000725476, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=7025 epoch 005: 958 / 1689 loss=3.933, nll_loss=2.419, ppl=5.35, wps=555821, ups=1.12, wpb=496101, bsz=16480.4, num_updates=7700, lr=0.00072075, gnorm=0.261, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=7114 epoch 005: 958 / 1689 loss=3.933, nll_loss=2.419, ppl=5.35, wps=555821, ups=1.12, wpb=496101, bsz=16480.4, num_updates=7700, lr=0.00072075, gnorm=0.261, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=7114 epoch 005: 958 / 1689 loss=3.933, nll_loss=2.419, ppl=5.35, wps=555821, ups=1.12, wpb=496101, bsz=16480.4, num_updates=7700, lr=0.00072075, gnorm=0.261, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=7114 epoch 005: 958 / 1689 loss=3.933, nll_loss=2.419, ppl=5.35, wps=555821, ups=1.12, wpb=496101, bsz=16480.4, num_updates=7700, lr=0.00072075, gnorm=0.261, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=7114 epoch 005: 958 / 1689 loss=3.933, nll_loss=2.419, ppl=5.35, wps=555821, ups=1.12, wpb=496101, bsz=16480.4, num_updates=7700, lr=0.00072075, gnorm=0.261, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=7114 epoch 005: 1058 / 1689 loss=3.917, nll_loss=2.402, ppl=5.29, wps=554530, ups=1.11, wpb=497554, bsz=16681, num_updates=7800, lr=0.000716115, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=7204 epoch 005: 1058 / 1689 loss=3.917, nll_loss=2.402, ppl=5.29, wps=554530, ups=1.11, wpb=497554, bsz=16681, num_updates=7800, lr=0.000716115, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=7204 epoch 005: 1058 / 1689 loss=3.917, nll_loss=2.402, ppl=5.29, wps=554530, ups=1.11, wpb=497554, bsz=16681, num_updates=7800, lr=0.000716115, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=7204 epoch 005: 1058 / 1689 loss=3.917, nll_loss=2.402, ppl=5.29, wps=554530, ups=1.11, wpb=497554, bsz=16681, num_updates=7800, lr=0.000716115, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=7204 epoch 005: 1058 / 1689 loss=3.917, nll_loss=2.402, ppl=5.29, wps=554530, ups=1.11, wpb=497554, bsz=16681, num_updates=7800, lr=0.000716115, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=7204 epoch 005: 1158 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=547770, ups=1.11, wpb=494002, bsz=16666.2, num_updates=7900, lr=0.000711568, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=7294 epoch 005: 1158 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=547770, ups=1.11, wpb=494002, bsz=16666.2, num_updates=7900, lr=0.000711568, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=7294 epoch 005: 1158 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=547770, ups=1.11, wpb=494002, bsz=16666.2, num_updates=7900, lr=0.000711568, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=7294 epoch 005: 1158 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=547770, ups=1.11, wpb=494002, bsz=16666.2, num_updates=7900, lr=0.000711568, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=7294 epoch 005: 1158 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=547770, ups=1.11, wpb=494002, bsz=16666.2, num_updates=7900, lr=0.000711568, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=7294 epoch 005: 1260 / 1689 loss=3.915, nll_loss=2.4, ppl=5.28, wps=537331, ups=1.09, wpb=493867, bsz=16429.8, num_updates=8000, lr=0.000707107, gnorm=0.256, clip=0, loss_scale=2, train_wall=91, gb_free=22, wall=7386 epoch 005: 1260 / 1689 loss=3.915, nll_loss=2.4, ppl=5.28, wps=537331, ups=1.09, wpb=493867, bsz=16429.8, num_updates=8000, lr=0.000707107, gnorm=0.256, clip=0, loss_scale=2, train_wall=91, gb_free=22, wall=7386 epoch 005: 1260 / 1689 loss=3.915, nll_loss=2.4, ppl=5.28, wps=537331, ups=1.09, wpb=493867, bsz=16429.8, num_updates=8000, lr=0.000707107, gnorm=0.256, clip=0, loss_scale=2, train_wall=91, gb_free=22, wall=7386 epoch 005: 1260 / 1689 loss=3.915, nll_loss=2.4, ppl=5.28, wps=537331, ups=1.09, wpb=493867, bsz=16429.8, num_updates=8000, lr=0.000707107, gnorm=0.256, clip=0, loss_scale=2, train_wall=91, gb_free=22, wall=7386 epoch 005: 1260 / 1689 loss=3.915, nll_loss=2.4, ppl=5.28, wps=537331, ups=1.09, wpb=493867, bsz=16429.8, num_updates=8000, lr=0.000707107, gnorm=0.256, clip=0, loss_scale=2, train_wall=91, gb_free=22, wall=7386 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 3.897 | nll_loss 2.331 | ppl 5.03 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.897 epoch 005 | valid on 'valid' subset | loss 3.897 | nll_loss 2.331 | ppl 5.03 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.897 epoch 005 | valid on 'valid' subset | loss 3.897 | nll_loss 2.331 | ppl 5.03 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.897 epoch 005 | valid on 'valid' subset | loss 3.897 | nll_loss 2.331 | ppl 5.03 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.897 epoch 005 | valid on 'valid' subset | loss 3.897 | nll_loss 2.331 | ppl 5.03 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.897 epoch 005: 1360 / 1689 loss=3.912, nll_loss=2.397, ppl=5.27, wps=456425, ups=0.92, wpb=495196, bsz=16770.3, num_updates=8100, lr=0.000702728, gnorm=0.264, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=7494 epoch 005: 1360 / 1689 loss=3.912, nll_loss=2.397, ppl=5.27, wps=456425, ups=0.92, wpb=495196, bsz=16770.3, num_updates=8100, lr=0.000702728, gnorm=0.264, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=7494 epoch 005: 1360 / 1689 loss=3.912, nll_loss=2.397, ppl=5.27, wps=456425, ups=0.92, wpb=495196, bsz=16770.3, num_updates=8100, lr=0.000702728, gnorm=0.264, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=7494 epoch 005: 1360 / 1689 loss=3.912, nll_loss=2.397, ppl=5.27, wps=456425, ups=0.92, wpb=495196, bsz=16770.3, num_updates=8100, lr=0.000702728, gnorm=0.264, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=7494 epoch 005: 1360 / 1689 loss=3.912, nll_loss=2.397, ppl=5.27, wps=456425, ups=0.92, wpb=495196, bsz=16770.3, num_updates=8100, lr=0.000702728, gnorm=0.264, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=7494 epoch 005: 1460 / 1689 loss=3.906, nll_loss=2.39, ppl=5.24, wps=551064, ups=1.11, wpb=495782, bsz=16427.8, num_updates=8200, lr=0.00069843, gnorm=0.247, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=7584 epoch 005: 1460 / 1689 loss=3.906, nll_loss=2.39, ppl=5.24, wps=551064, ups=1.11, wpb=495782, bsz=16427.8, num_updates=8200, lr=0.00069843, gnorm=0.247, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=7584 epoch 005: 1460 / 1689 loss=3.906, nll_loss=2.39, ppl=5.24, wps=551064, ups=1.11, wpb=495782, bsz=16427.8, num_updates=8200, lr=0.00069843, gnorm=0.247, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=7584 epoch 005: 1460 / 1689 loss=3.906, nll_loss=2.39, ppl=5.24, wps=551064, ups=1.11, wpb=495782, bsz=16427.8, num_updates=8200, lr=0.00069843, gnorm=0.247, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=7584 epoch 005: 1460 / 1689 loss=3.906, nll_loss=2.39, ppl=5.24, wps=551064, ups=1.11, wpb=495782, bsz=16427.8, num_updates=8200, lr=0.00069843, gnorm=0.247, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=7584 epoch 005: 1560 / 1689 loss=3.906, nll_loss=2.39, ppl=5.24, wps=550905, ups=1.11, wpb=495694, bsz=16150.2, num_updates=8300, lr=0.00069421, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=7674 epoch 005: 1560 / 1689 loss=3.906, nll_loss=2.39, ppl=5.24, wps=550905, ups=1.11, wpb=495694, bsz=16150.2, num_updates=8300, lr=0.00069421, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=7674 epoch 005: 1560 / 1689 loss=3.906, nll_loss=2.39, ppl=5.24, wps=550905, ups=1.11, wpb=495694, bsz=16150.2, num_updates=8300, lr=0.00069421, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=7674 epoch 005: 1560 / 1689 loss=3.906, nll_loss=2.39, ppl=5.24, wps=550905, ups=1.11, wpb=495694, bsz=16150.2, num_updates=8300, lr=0.00069421, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=7674 epoch 005: 1560 / 1689 loss=3.906, nll_loss=2.39, ppl=5.24, wps=550905, ups=1.11, wpb=495694, bsz=16150.2, num_updates=8300, lr=0.00069421, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=7674 epoch 005: 1660 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=546530, ups=1.1, wpb=495495, bsz=16827.4, num_updates=8400, lr=0.000690066, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=7765 epoch 005: 1660 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=546530, ups=1.1, wpb=495495, bsz=16827.4, num_updates=8400, lr=0.000690066, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=7765 epoch 005: 1660 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=546530, ups=1.1, wpb=495495, bsz=16827.4, num_updates=8400, lr=0.000690066, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=7765 epoch 005: 1660 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=546530, ups=1.1, wpb=495495, bsz=16827.4, num_updates=8400, lr=0.000690066, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=7765 epoch 005: 1660 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=546530, ups=1.1, wpb=495495, bsz=16827.4, num_updates=8400, lr=0.000690066, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=7765 end of epoch 5 (average epoch stats below) epoch 005 | loss 3.924 | nll_loss 2.408 | ppl 5.31 | wps 535954 | ups 1.08 | wpb 495122 | bsz 16504.2 | num_updates 8429 | lr 0.000688877 | gnorm 0.259 | clip 0 | loss_scale 2 | train_wall 1491 | gb_free 22.3 | wall 7791 epoch 005 | loss 3.924 | nll_loss 2.408 | ppl 5.31 | wps 535954 | ups 1.08 | wpb 495122 | bsz 16504.2 | num_updates 8429 | lr 0.000688877 | gnorm 0.259 | clip 0 | loss_scale 2 | train_wall 1491 | gb_free 22.3 | wall 7791 epoch 005 | loss 3.924 | nll_loss 2.408 | ppl 5.31 | wps 535954 | ups 1.08 | wpb 495122 | bsz 16504.2 | num_updates 8429 | lr 0.000688877 | gnorm 0.259 | clip 0 | loss_scale 2 | train_wall 1491 | gb_free 22.3 | wall 7791 epoch 005 | loss 3.924 | nll_loss 2.408 | ppl 5.31 | wps 535954 | ups 1.08 | wpb 495122 | bsz 16504.2 | num_updates 8429 | lr 0.000688877 | gnorm 0.259 | clip 0 | loss_scale 2 | train_wall 1491 | gb_free 22.3 | wall 7791 epoch 005 | loss 3.924 | nll_loss 2.408 | ppl 5.31 | wps 535954 | ups 1.08 | wpb 495122 | bsz 16504.2 | num_updates 8429 | lr 0.000688877 | gnorm 0.259 | clip 0 | loss_scale 2 | train_wall 1491 | gb_free 22.3 | wall 7791 Start iterating over samples epoch 006: 71 / 1689 loss=3.877, nll_loss=2.357, ppl=5.12, wps=548039, ups=1.11, wpb=492909, bsz=16035, num_updates=8500, lr=0.000685994, gnorm=0.246, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=7855 epoch 006: 71 / 1689 loss=3.877, nll_loss=2.357, ppl=5.12, wps=548039, ups=1.11, wpb=492909, bsz=16035, num_updates=8500, lr=0.000685994, gnorm=0.246, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=7855 epoch 006: 71 / 1689 loss=3.877, nll_loss=2.357, ppl=5.12, wps=548039, ups=1.11, wpb=492909, bsz=16035, num_updates=8500, lr=0.000685994, gnorm=0.246, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=7855 epoch 006: 71 / 1689 loss=3.877, nll_loss=2.357, ppl=5.12, wps=548039, ups=1.11, wpb=492909, bsz=16035, num_updates=8500, lr=0.000685994, gnorm=0.246, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=7855 epoch 006: 71 / 1689 loss=3.877, nll_loss=2.357, ppl=5.12, wps=548039, ups=1.11, wpb=492909, bsz=16035, num_updates=8500, lr=0.000685994, gnorm=0.246, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=7855 epoch 006: 71 / 1689 loss=3.877, nll_loss=2.357, ppl=5.12, wps=548039, ups=1.11, wpb=492909, bsz=16035, num_updates=8500, lr=0.000685994, gnorm=0.246, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=7855 epoch 006: 171 / 1689 loss=3.876, nll_loss=2.356, ppl=5.12, wps=549057, ups=1.11, wpb=495166, bsz=16344.9, num_updates=8600, lr=0.000681994, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=7945 epoch 006: 171 / 1689 loss=3.876, nll_loss=2.356, ppl=5.12, wps=549057, ups=1.11, wpb=495166, bsz=16344.9, num_updates=8600, lr=0.000681994, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=7945 epoch 006: 171 / 1689 loss=3.876, nll_loss=2.356, ppl=5.12, wps=549057, ups=1.11, wpb=495166, bsz=16344.9, num_updates=8600, lr=0.000681994, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=7945 epoch 006: 171 / 1689 loss=3.876, nll_loss=2.356, ppl=5.12, wps=549057, ups=1.11, wpb=495166, bsz=16344.9, num_updates=8600, lr=0.000681994, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=7945 epoch 006: 171 / 1689 loss=3.876, nll_loss=2.356, ppl=5.12, wps=549057, ups=1.11, wpb=495166, bsz=16344.9, num_updates=8600, lr=0.000681994, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=7945 epoch 006: 171 / 1689 loss=3.876, nll_loss=2.356, ppl=5.12, wps=549057, ups=1.11, wpb=495166, bsz=16344.9, num_updates=8600, lr=0.000681994, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=7945 epoch 006: 272 / 1689 loss=3.868, nll_loss=2.347, ppl=5.09, wps=548216, ups=1.11, wpb=496112, bsz=16501.4, num_updates=8700, lr=0.000678064, gnorm=0.239, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=8036 epoch 006: 272 / 1689 loss=3.868, nll_loss=2.347, ppl=5.09, wps=548216, ups=1.11, wpb=496112, bsz=16501.4, num_updates=8700, lr=0.000678064, gnorm=0.239, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=8036 epoch 006: 272 / 1689 loss=3.868, nll_loss=2.347, ppl=5.09, wps=548216, ups=1.11, wpb=496112, bsz=16501.4, num_updates=8700, lr=0.000678064, gnorm=0.239, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=8036 epoch 006: 272 / 1689 loss=3.868, nll_loss=2.347, ppl=5.09, wps=548216, ups=1.11, wpb=496112, bsz=16501.4, num_updates=8700, lr=0.000678064, gnorm=0.239, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=8036 epoch 006: 272 / 1689 loss=3.868, nll_loss=2.347, ppl=5.09, wps=548216, ups=1.11, wpb=496112, bsz=16501.4, num_updates=8700, lr=0.000678064, gnorm=0.239, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=8036 epoch 006: 272 / 1689 loss=3.868, nll_loss=2.347, ppl=5.09, wps=548216, ups=1.11, wpb=496112, bsz=16501.4, num_updates=8700, lr=0.000678064, gnorm=0.239, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=8036 epoch 006: 372 / 1689 loss=3.878, nll_loss=2.358, ppl=5.13, wps=548861, ups=1.11, wpb=494415, bsz=16348.2, num_updates=8800, lr=0.0006742, gnorm=0.257, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=8126 epoch 006: 372 / 1689 loss=3.878, nll_loss=2.358, ppl=5.13, wps=548861, ups=1.11, wpb=494415, bsz=16348.2, num_updates=8800, lr=0.0006742, gnorm=0.257, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=8126 epoch 006: 372 / 1689 loss=3.878, nll_loss=2.358, ppl=5.13, wps=548861, ups=1.11, wpb=494415, bsz=16348.2, num_updates=8800, lr=0.0006742, gnorm=0.257, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=8126 epoch 006: 372 / 1689 loss=3.878, nll_loss=2.358, ppl=5.13, wps=548861, ups=1.11, wpb=494415, bsz=16348.2, num_updates=8800, lr=0.0006742, gnorm=0.257, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=8126 epoch 006: 372 / 1689 loss=3.878, nll_loss=2.358, ppl=5.13, wps=548861, ups=1.11, wpb=494415, bsz=16348.2, num_updates=8800, lr=0.0006742, gnorm=0.257, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=8126 epoch 006: 372 / 1689 loss=3.878, nll_loss=2.358, ppl=5.13, wps=548861, ups=1.11, wpb=494415, bsz=16348.2, num_updates=8800, lr=0.0006742, gnorm=0.257, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=8126 epoch 006: 472 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=551017, ups=1.11, wpb=496181, bsz=16415.9, num_updates=8900, lr=0.000670402, gnorm=0.255, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=8216 epoch 006: 472 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=551017, ups=1.11, wpb=496181, bsz=16415.9, num_updates=8900, lr=0.000670402, gnorm=0.255, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=8216 epoch 006: 472 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=551017, ups=1.11, wpb=496181, bsz=16415.9, num_updates=8900, lr=0.000670402, gnorm=0.255, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=8216 epoch 006: 472 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=551017, ups=1.11, wpb=496181, bsz=16415.9, num_updates=8900, lr=0.000670402, gnorm=0.255, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=8216 epoch 006: 472 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=551017, ups=1.11, wpb=496181, bsz=16415.9, num_updates=8900, lr=0.000670402, gnorm=0.255, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=8216 epoch 006: 472 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=551017, ups=1.11, wpb=496181, bsz=16415.9, num_updates=8900, lr=0.000670402, gnorm=0.255, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=8216 epoch 006: 572 / 1689 loss=3.873, nll_loss=2.354, ppl=5.11, wps=552058, ups=1.11, wpb=495502, bsz=16693.6, num_updates=9000, lr=0.000666667, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=8306 epoch 006: 572 / 1689 loss=3.873, nll_loss=2.354, ppl=5.11, wps=552058, ups=1.11, wpb=495502, bsz=16693.6, num_updates=9000, lr=0.000666667, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=8306 epoch 006: 572 / 1689 loss=3.873, nll_loss=2.354, ppl=5.11, wps=552058, ups=1.11, wpb=495502, bsz=16693.6, num_updates=9000, lr=0.000666667, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=8306 epoch 006: 572 / 1689 loss=3.873, nll_loss=2.354, ppl=5.11, wps=552058, ups=1.11, wpb=495502, bsz=16693.6, num_updates=9000, lr=0.000666667, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=8306 epoch 006: 572 / 1689 loss=3.873, nll_loss=2.354, ppl=5.11, wps=552058, ups=1.11, wpb=495502, bsz=16693.6, num_updates=9000, lr=0.000666667, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=8306 epoch 006: 572 / 1689 loss=3.873, nll_loss=2.354, ppl=5.11, wps=552058, ups=1.11, wpb=495502, bsz=16693.6, num_updates=9000, lr=0.000666667, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=8306 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 3.865 | nll_loss 2.302 | ppl 4.93 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.865 epoch 006 | valid on 'valid' subset | loss 3.865 | nll_loss 2.302 | ppl 4.93 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.865 epoch 006 | valid on 'valid' subset | loss 3.865 | nll_loss 2.302 | ppl 4.93 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.865 epoch 006 | valid on 'valid' subset | loss 3.865 | nll_loss 2.302 | ppl 4.93 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.865 epoch 006 | valid on 'valid' subset | loss 3.865 | nll_loss 2.302 | ppl 4.93 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.865 epoch 006 | valid on 'valid' subset | loss 3.865 | nll_loss 2.302 | ppl 4.93 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.865 epoch 006: 672 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=451987, ups=0.92, wpb=493823, bsz=16862.1, num_updates=9100, lr=0.000662994, gnorm=0.248, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=8415 epoch 006: 672 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=451987, ups=0.92, wpb=493823, bsz=16862.1, num_updates=9100, lr=0.000662994, gnorm=0.248, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=8415 epoch 006: 672 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=451987, ups=0.92, wpb=493823, bsz=16862.1, num_updates=9100, lr=0.000662994, gnorm=0.248, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=8415 epoch 006: 672 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=451987, ups=0.92, wpb=493823, bsz=16862.1, num_updates=9100, lr=0.000662994, gnorm=0.248, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=8415 epoch 006: 672 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=451987, ups=0.92, wpb=493823, bsz=16862.1, num_updates=9100, lr=0.000662994, gnorm=0.248, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=8415 epoch 006: 672 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=451987, ups=0.92, wpb=493823, bsz=16862.1, num_updates=9100, lr=0.000662994, gnorm=0.248, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=8415 epoch 006: 772 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=549288, ups=1.11, wpb=496036, bsz=16440.3, num_updates=9200, lr=0.00065938, gnorm=0.242, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=8505 epoch 006: 772 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=549288, ups=1.11, wpb=496036, bsz=16440.3, num_updates=9200, lr=0.00065938, gnorm=0.242, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=8505 epoch 006: 772 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=549288, ups=1.11, wpb=496036, bsz=16440.3, num_updates=9200, lr=0.00065938, gnorm=0.242, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=8505 epoch 006: 772 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=549288, ups=1.11, wpb=496036, bsz=16440.3, num_updates=9200, lr=0.00065938, gnorm=0.242, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=8505 epoch 006: 772 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=549288, ups=1.11, wpb=496036, bsz=16440.3, num_updates=9200, lr=0.00065938, gnorm=0.242, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=8505 epoch 006: 772 / 1689 loss=3.87, nll_loss=2.35, ppl=5.1, wps=549288, ups=1.11, wpb=496036, bsz=16440.3, num_updates=9200, lr=0.00065938, gnorm=0.242, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=8505 epoch 006: 873 / 1689 loss=3.873, nll_loss=2.354, ppl=5.11, wps=549527, ups=1.11, wpb=495145, bsz=16821.4, num_updates=9300, lr=0.000655826, gnorm=0.255, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=8595 epoch 006: 873 / 1689 loss=3.873, nll_loss=2.354, ppl=5.11, wps=549527, ups=1.11, wpb=495145, bsz=16821.4, num_updates=9300, lr=0.000655826, gnorm=0.255, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=8595 epoch 006: 873 / 1689 loss=3.873, nll_loss=2.354, ppl=5.11, wps=549527, ups=1.11, wpb=495145, bsz=16821.4, num_updates=9300, lr=0.000655826, gnorm=0.255, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=8595 epoch 006: 873 / 1689 loss=3.873, nll_loss=2.354, ppl=5.11, wps=549527, ups=1.11, wpb=495145, bsz=16821.4, num_updates=9300, lr=0.000655826, gnorm=0.255, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=8595 epoch 006: 873 / 1689 loss=3.873, nll_loss=2.354, ppl=5.11, wps=549527, ups=1.11, wpb=495145, bsz=16821.4, num_updates=9300, lr=0.000655826, gnorm=0.255, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=8595 epoch 006: 873 / 1689 loss=3.873, nll_loss=2.354, ppl=5.11, wps=549527, ups=1.11, wpb=495145, bsz=16821.4, num_updates=9300, lr=0.000655826, gnorm=0.255, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=8595 epoch 006: 973 / 1689 loss=3.875, nll_loss=2.357, ppl=5.12, wps=553223, ups=1.12, wpb=493201, bsz=16353.6, num_updates=9400, lr=0.000652328, gnorm=0.248, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=8684 epoch 006: 973 / 1689 loss=3.875, nll_loss=2.357, ppl=5.12, wps=553223, ups=1.12, wpb=493201, bsz=16353.6, num_updates=9400, lr=0.000652328, gnorm=0.248, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=8684 epoch 006: 973 / 1689 loss=3.875, nll_loss=2.357, ppl=5.12, wps=553223, ups=1.12, wpb=493201, bsz=16353.6, num_updates=9400, lr=0.000652328, gnorm=0.248, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=8684 epoch 006: 973 / 1689 loss=3.875, nll_loss=2.357, ppl=5.12, wps=553223, ups=1.12, wpb=493201, bsz=16353.6, num_updates=9400, lr=0.000652328, gnorm=0.248, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=8684 epoch 006: 973 / 1689 loss=3.875, nll_loss=2.357, ppl=5.12, wps=553223, ups=1.12, wpb=493201, bsz=16353.6, num_updates=9400, lr=0.000652328, gnorm=0.248, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=8684 epoch 006: 973 / 1689 loss=3.875, nll_loss=2.357, ppl=5.12, wps=553223, ups=1.12, wpb=493201, bsz=16353.6, num_updates=9400, lr=0.000652328, gnorm=0.248, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=8684 epoch 006: 1073 / 1689 loss=3.866, nll_loss=2.346, ppl=5.08, wps=556206, ups=1.12, wpb=494674, bsz=16464, num_updates=9500, lr=0.000648886, gnorm=0.243, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=8773 epoch 006: 1073 / 1689 loss=3.866, nll_loss=2.346, ppl=5.08, wps=556206, ups=1.12, wpb=494674, bsz=16464, num_updates=9500, lr=0.000648886, gnorm=0.243, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=8773 epoch 006: 1073 / 1689 loss=3.866, nll_loss=2.346, ppl=5.08, wps=556206, ups=1.12, wpb=494674, bsz=16464, num_updates=9500, lr=0.000648886, gnorm=0.243, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=8773 epoch 006: 1073 / 1689 loss=3.866, nll_loss=2.346, ppl=5.08, wps=556206, ups=1.12, wpb=494674, bsz=16464, num_updates=9500, lr=0.000648886, gnorm=0.243, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=8773 epoch 006: 1073 / 1689 loss=3.866, nll_loss=2.346, ppl=5.08, wps=556206, ups=1.12, wpb=494674, bsz=16464, num_updates=9500, lr=0.000648886, gnorm=0.243, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=8773 epoch 006: 1073 / 1689 loss=3.866, nll_loss=2.346, ppl=5.08, wps=556206, ups=1.12, wpb=494674, bsz=16464, num_updates=9500, lr=0.000648886, gnorm=0.243, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=8773 epoch 006: 1173 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=554060, ups=1.12, wpb=494269, bsz=16737, num_updates=9600, lr=0.000645497, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=8863 epoch 006: 1173 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=554060, ups=1.12, wpb=494269, bsz=16737, num_updates=9600, lr=0.000645497, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=8863 epoch 006: 1173 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=554060, ups=1.12, wpb=494269, bsz=16737, num_updates=9600, lr=0.000645497, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=8863 epoch 006: 1173 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=554060, ups=1.12, wpb=494269, bsz=16737, num_updates=9600, lr=0.000645497, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=8863 epoch 006: 1173 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=554060, ups=1.12, wpb=494269, bsz=16737, num_updates=9600, lr=0.000645497, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=8863 epoch 006: 1173 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=554060, ups=1.12, wpb=494269, bsz=16737, num_updates=9600, lr=0.000645497, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=8863 epoch 006: 1273 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=554250, ups=1.12, wpb=495636, bsz=16163.1, num_updates=9700, lr=0.000642161, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=8952 epoch 006: 1273 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=554250, ups=1.12, wpb=495636, bsz=16163.1, num_updates=9700, lr=0.000642161, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=8952 epoch 006: 1273 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=554250, ups=1.12, wpb=495636, bsz=16163.1, num_updates=9700, lr=0.000642161, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=8952 epoch 006: 1273 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=554250, ups=1.12, wpb=495636, bsz=16163.1, num_updates=9700, lr=0.000642161, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=8952 epoch 006: 1273 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=554250, ups=1.12, wpb=495636, bsz=16163.1, num_updates=9700, lr=0.000642161, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=8952 epoch 006: 1273 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=554250, ups=1.12, wpb=495636, bsz=16163.1, num_updates=9700, lr=0.000642161, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=8952 epoch 006: 1373 / 1689 loss=3.861, nll_loss=2.341, ppl=5.07, wps=553925, ups=1.11, wpb=497231, bsz=16715.9, num_updates=9800, lr=0.000638877, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=9042 epoch 006: 1373 / 1689 loss=3.861, nll_loss=2.341, ppl=5.07, wps=553925, ups=1.11, wpb=497231, bsz=16715.9, num_updates=9800, lr=0.000638877, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=9042 epoch 006: 1373 / 1689 loss=3.861, nll_loss=2.341, ppl=5.07, wps=553925, ups=1.11, wpb=497231, bsz=16715.9, num_updates=9800, lr=0.000638877, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=9042 epoch 006: 1373 / 1689 loss=3.861, nll_loss=2.341, ppl=5.07, wps=553925, ups=1.11, wpb=497231, bsz=16715.9, num_updates=9800, lr=0.000638877, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=9042 epoch 006: 1373 / 1689 loss=3.861, nll_loss=2.341, ppl=5.07, wps=553925, ups=1.11, wpb=497231, bsz=16715.9, num_updates=9800, lr=0.000638877, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=9042 epoch 006: 1373 / 1689 loss=3.861, nll_loss=2.341, ppl=5.07, wps=553925, ups=1.11, wpb=497231, bsz=16715.9, num_updates=9800, lr=0.000638877, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=9042 epoch 006: 1475 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=544607, ups=1.1, wpb=495014, bsz=16253, num_updates=9900, lr=0.000635642, gnorm=0.244, clip=0, loss_scale=1, train_wall=90, gb_free=21.2, wall=9133 epoch 006: 1475 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=544607, ups=1.1, wpb=495014, bsz=16253, num_updates=9900, lr=0.000635642, gnorm=0.244, clip=0, loss_scale=1, train_wall=90, gb_free=21.2, wall=9133 epoch 006: 1475 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=544607, ups=1.1, wpb=495014, bsz=16253, num_updates=9900, lr=0.000635642, gnorm=0.244, clip=0, loss_scale=1, train_wall=90, gb_free=21.2, wall=9133 epoch 006: 1475 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=544607, ups=1.1, wpb=495014, bsz=16253, num_updates=9900, lr=0.000635642, gnorm=0.244, clip=0, loss_scale=1, train_wall=90, gb_free=21.2, wall=9133 epoch 006: 1475 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=544607, ups=1.1, wpb=495014, bsz=16253, num_updates=9900, lr=0.000635642, gnorm=0.244, clip=0, loss_scale=1, train_wall=90, gb_free=21.2, wall=9133 epoch 006: 1475 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=544607, ups=1.1, wpb=495014, bsz=16253, num_updates=9900, lr=0.000635642, gnorm=0.244, clip=0, loss_scale=1, train_wall=90, gb_free=21.2, wall=9133 epoch 006: 1575 / 1689 loss=3.853, nll_loss=2.333, ppl=5.04, wps=548344, ups=1.11, wpb=496086, bsz=16840.9, num_updates=10000, lr=0.000632456, gnorm=0.238, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9223 epoch 006: 1575 / 1689 loss=3.853, nll_loss=2.333, ppl=5.04, wps=548344, ups=1.11, wpb=496086, bsz=16840.9, num_updates=10000, lr=0.000632456, gnorm=0.238, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9223 epoch 006: 1575 / 1689 loss=3.853, nll_loss=2.333, ppl=5.04, wps=548344, ups=1.11, wpb=496086, bsz=16840.9, num_updates=10000, lr=0.000632456, gnorm=0.238, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9223 epoch 006: 1575 / 1689 loss=3.853, nll_loss=2.333, ppl=5.04, wps=548344, ups=1.11, wpb=496086, bsz=16840.9, num_updates=10000, lr=0.000632456, gnorm=0.238, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9223 epoch 006: 1575 / 1689 loss=3.853, nll_loss=2.333, ppl=5.04, wps=548344, ups=1.11, wpb=496086, bsz=16840.9, num_updates=10000, lr=0.000632456, gnorm=0.238, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9223 epoch 006: 1575 / 1689 loss=3.853, nll_loss=2.333, ppl=5.04, wps=548344, ups=1.11, wpb=496086, bsz=16840.9, num_updates=10000, lr=0.000632456, gnorm=0.238, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9223 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 3.845 | nll_loss 2.282 | ppl 4.86 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.845 epoch 006 | valid on 'valid' subset | loss 3.845 | nll_loss 2.282 | ppl 4.86 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.845 epoch 006 | valid on 'valid' subset | loss 3.845 | nll_loss 2.282 | ppl 4.86 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.845 epoch 006 | valid on 'valid' subset | loss 3.845 | nll_loss 2.282 | ppl 4.86 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.845 epoch 006 | valid on 'valid' subset | loss 3.845 | nll_loss 2.282 | ppl 4.86 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.845 epoch 006 | valid on 'valid' subset | loss 3.845 | nll_loss 2.282 | ppl 4.86 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.845 epoch 006: 1675 / 1689 loss=3.858, nll_loss=2.338, ppl=5.06, wps=403205, ups=0.81, wpb=495407, bsz=16485.5, num_updates=10100, lr=0.000629317, gnorm=0.239, clip=0, loss_scale=1, train_wall=99, gb_free=22, wall=9346 epoch 006: 1675 / 1689 loss=3.858, nll_loss=2.338, ppl=5.06, wps=403205, ups=0.81, wpb=495407, bsz=16485.5, num_updates=10100, lr=0.000629317, gnorm=0.239, clip=0, loss_scale=1, train_wall=99, gb_free=22, wall=9346 epoch 006: 1675 / 1689 loss=3.858, nll_loss=2.338, ppl=5.06, wps=403205, ups=0.81, wpb=495407, bsz=16485.5, num_updates=10100, lr=0.000629317, gnorm=0.239, clip=0, loss_scale=1, train_wall=99, gb_free=22, wall=9346 epoch 006: 1675 / 1689 loss=3.858, nll_loss=2.338, ppl=5.06, wps=403205, ups=0.81, wpb=495407, bsz=16485.5, num_updates=10100, lr=0.000629317, gnorm=0.239, clip=0, loss_scale=1, train_wall=99, gb_free=22, wall=9346 epoch 006: 1675 / 1689 loss=3.858, nll_loss=2.338, ppl=5.06, wps=403205, ups=0.81, wpb=495407, bsz=16485.5, num_updates=10100, lr=0.000629317, gnorm=0.239, clip=0, loss_scale=1, train_wall=99, gb_free=22, wall=9346 epoch 006: 1675 / 1689 loss=3.858, nll_loss=2.338, ppl=5.06, wps=403205, ups=0.81, wpb=495407, bsz=16485.5, num_updates=10100, lr=0.000629317, gnorm=0.239, clip=0, loss_scale=1, train_wall=99, gb_free=22, wall=9346 end of epoch 6 (average epoch stats below) epoch 006 | loss 3.868 | nll_loss 2.348 | ppl 5.09 | wps 532255 | ups 1.07 | wpb 495133 | bsz 16504.4 | num_updates 10114 | lr 0.000628881 | gnorm 0.246 | clip 0 | loss_scale 1 | train_wall 1502 | gb_free 22.8 | wall 9358 epoch 006 | loss 3.868 | nll_loss 2.348 | ppl 5.09 | wps 532255 | ups 1.07 | wpb 495133 | bsz 16504.4 | num_updates 10114 | lr 0.000628881 | gnorm 0.246 | clip 0 | loss_scale 1 | train_wall 1502 | gb_free 22.8 | wall 9358 epoch 006 | loss 3.868 | nll_loss 2.348 | ppl 5.09 | wps 532255 | ups 1.07 | wpb 495133 | bsz 16504.4 | num_updates 10114 | lr 0.000628881 | gnorm 0.246 | clip 0 | loss_scale 1 | train_wall 1502 | gb_free 22.8 | wall 9358 epoch 006 | loss 3.868 | nll_loss 2.348 | ppl 5.09 | wps 532255 | ups 1.07 | wpb 495133 | bsz 16504.4 | num_updates 10114 | lr 0.000628881 | gnorm 0.246 | clip 0 | loss_scale 1 | train_wall 1502 | gb_free 22.8 | wall 9358 epoch 006 | loss 3.868 | nll_loss 2.348 | ppl 5.09 | wps 532255 | ups 1.07 | wpb 495133 | bsz 16504.4 | num_updates 10114 | lr 0.000628881 | gnorm 0.246 | clip 0 | loss_scale 1 | train_wall 1502 | gb_free 22.8 | wall 9358 epoch 006 | loss 3.868 | nll_loss 2.348 | ppl 5.09 | wps 532255 | ups 1.07 | wpb 495133 | bsz 16504.4 | num_updates 10114 | lr 0.000628881 | gnorm 0.246 | clip 0 | loss_scale 1 | train_wall 1502 | gb_free 22.8 | wall 9358 Start iterating over samples epoch 007: 86 / 1689 loss=3.835, nll_loss=2.311, ppl=4.96, wps=547139, ups=1.12, wpb=490616, bsz=16304.6, num_updates=10200, lr=0.000626224, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=9436 epoch 007: 86 / 1689 loss=3.835, nll_loss=2.311, ppl=4.96, wps=547139, ups=1.12, wpb=490616, bsz=16304.6, num_updates=10200, lr=0.000626224, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=9436 epoch 007: 86 / 1689 loss=3.835, nll_loss=2.311, ppl=4.96, wps=547139, ups=1.12, wpb=490616, bsz=16304.6, num_updates=10200, lr=0.000626224, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=9436 epoch 007: 86 / 1689 loss=3.835, nll_loss=2.311, ppl=4.96, wps=547139, ups=1.12, wpb=490616, bsz=16304.6, num_updates=10200, lr=0.000626224, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=9436 epoch 007: 86 / 1689 loss=3.835, nll_loss=2.311, ppl=4.96, wps=547139, ups=1.12, wpb=490616, bsz=16304.6, num_updates=10200, lr=0.000626224, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=9436 epoch 007: 86 / 1689 loss=3.835, nll_loss=2.311, ppl=4.96, wps=547139, ups=1.12, wpb=490616, bsz=16304.6, num_updates=10200, lr=0.000626224, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=9436 epoch 007: 86 / 1689 loss=3.835, nll_loss=2.311, ppl=4.96, wps=547139, ups=1.12, wpb=490616, bsz=16304.6, num_updates=10200, lr=0.000626224, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=9436 epoch 007: 186 / 1689 loss=3.823, nll_loss=2.298, ppl=4.92, wps=552671, ups=1.11, wpb=495907, bsz=16713.6, num_updates=10300, lr=0.000623177, gnorm=0.231, clip=0, loss_scale=1, train_wall=89, gb_free=20.6, wall=9525 epoch 007: 186 / 1689 loss=3.823, nll_loss=2.298, ppl=4.92, wps=552671, ups=1.11, wpb=495907, bsz=16713.6, num_updates=10300, lr=0.000623177, gnorm=0.231, clip=0, loss_scale=1, train_wall=89, gb_free=20.6, wall=9525 epoch 007: 186 / 1689 loss=3.823, nll_loss=2.298, ppl=4.92, wps=552671, ups=1.11, wpb=495907, bsz=16713.6, num_updates=10300, lr=0.000623177, gnorm=0.231, clip=0, loss_scale=1, train_wall=89, gb_free=20.6, wall=9525 epoch 007: 186 / 1689 loss=3.823, nll_loss=2.298, ppl=4.92, wps=552671, ups=1.11, wpb=495907, bsz=16713.6, num_updates=10300, lr=0.000623177, gnorm=0.231, clip=0, loss_scale=1, train_wall=89, gb_free=20.6, wall=9525 epoch 007: 186 / 1689 loss=3.823, nll_loss=2.298, ppl=4.92, wps=552671, ups=1.11, wpb=495907, bsz=16713.6, num_updates=10300, lr=0.000623177, gnorm=0.231, clip=0, loss_scale=1, train_wall=89, gb_free=20.6, wall=9525 epoch 007: 186 / 1689 loss=3.823, nll_loss=2.298, ppl=4.92, wps=552671, ups=1.11, wpb=495907, bsz=16713.6, num_updates=10300, lr=0.000623177, gnorm=0.231, clip=0, loss_scale=1, train_wall=89, gb_free=20.6, wall=9525 epoch 007: 186 / 1689 loss=3.823, nll_loss=2.298, ppl=4.92, wps=552671, ups=1.11, wpb=495907, bsz=16713.6, num_updates=10300, lr=0.000623177, gnorm=0.231, clip=0, loss_scale=1, train_wall=89, gb_free=20.6, wall=9525 epoch 007: 286 / 1689 loss=3.831, nll_loss=2.307, ppl=4.95, wps=552979, ups=1.12, wpb=495352, bsz=16531.4, num_updates=10400, lr=0.000620174, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=9615 epoch 007: 286 / 1689 loss=3.831, nll_loss=2.307, ppl=4.95, wps=552979, ups=1.12, wpb=495352, bsz=16531.4, num_updates=10400, lr=0.000620174, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=9615 epoch 007: 286 / 1689 loss=3.831, nll_loss=2.307, ppl=4.95, wps=552979, ups=1.12, wpb=495352, bsz=16531.4, num_updates=10400, lr=0.000620174, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=9615 epoch 007: 286 / 1689 loss=3.831, nll_loss=2.307, ppl=4.95, wps=552979, ups=1.12, wpb=495352, bsz=16531.4, num_updates=10400, lr=0.000620174, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=9615 epoch 007: 286 / 1689 loss=3.831, nll_loss=2.307, ppl=4.95, wps=552979, ups=1.12, wpb=495352, bsz=16531.4, num_updates=10400, lr=0.000620174, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=9615 epoch 007: 286 / 1689 loss=3.831, nll_loss=2.307, ppl=4.95, wps=552979, ups=1.12, wpb=495352, bsz=16531.4, num_updates=10400, lr=0.000620174, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=9615 epoch 007: 286 / 1689 loss=3.831, nll_loss=2.307, ppl=4.95, wps=552979, ups=1.12, wpb=495352, bsz=16531.4, num_updates=10400, lr=0.000620174, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=9615 epoch 007: 386 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=551487, ups=1.11, wpb=494926, bsz=16206, num_updates=10500, lr=0.000617213, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=9705 epoch 007: 386 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=551487, ups=1.11, wpb=494926, bsz=16206, num_updates=10500, lr=0.000617213, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=9705 epoch 007: 386 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=551487, ups=1.11, wpb=494926, bsz=16206, num_updates=10500, lr=0.000617213, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=9705 epoch 007: 386 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=551487, ups=1.11, wpb=494926, bsz=16206, num_updates=10500, lr=0.000617213, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=9705 epoch 007: 386 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=551487, ups=1.11, wpb=494926, bsz=16206, num_updates=10500, lr=0.000617213, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=9705 epoch 007: 386 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=551487, ups=1.11, wpb=494926, bsz=16206, num_updates=10500, lr=0.000617213, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=9705 epoch 007: 386 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=551487, ups=1.11, wpb=494926, bsz=16206, num_updates=10500, lr=0.000617213, gnorm=0.243, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=9705 epoch 007: 487 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=547866, ups=1.11, wpb=495541, bsz=16473.5, num_updates=10600, lr=0.000614295, gnorm=0.24, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9795 epoch 007: 487 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=547866, ups=1.11, wpb=495541, bsz=16473.5, num_updates=10600, lr=0.000614295, gnorm=0.24, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9795 epoch 007: 487 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=547866, ups=1.11, wpb=495541, bsz=16473.5, num_updates=10600, lr=0.000614295, gnorm=0.24, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9795 epoch 007: 487 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=547866, ups=1.11, wpb=495541, bsz=16473.5, num_updates=10600, lr=0.000614295, gnorm=0.24, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9795 epoch 007: 487 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=547866, ups=1.11, wpb=495541, bsz=16473.5, num_updates=10600, lr=0.000614295, gnorm=0.24, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9795 epoch 007: 487 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=547866, ups=1.11, wpb=495541, bsz=16473.5, num_updates=10600, lr=0.000614295, gnorm=0.24, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9795 epoch 007: 487 / 1689 loss=3.835, nll_loss=2.312, ppl=4.97, wps=547866, ups=1.11, wpb=495541, bsz=16473.5, num_updates=10600, lr=0.000614295, gnorm=0.24, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=9795 epoch 007: 587 / 1689 loss=3.832, nll_loss=2.309, ppl=4.96, wps=554407, ups=1.12, wpb=496059, bsz=16510.4, num_updates=10700, lr=0.000611418, gnorm=0.24, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=9885 epoch 007: 587 / 1689 loss=3.832, nll_loss=2.309, ppl=4.96, wps=554407, ups=1.12, wpb=496059, bsz=16510.4, num_updates=10700, lr=0.000611418, gnorm=0.24, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=9885 epoch 007: 587 / 1689 loss=3.832, nll_loss=2.309, ppl=4.96, wps=554407, ups=1.12, wpb=496059, bsz=16510.4, num_updates=10700, lr=0.000611418, gnorm=0.24, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=9885 epoch 007: 587 / 1689 loss=3.832, nll_loss=2.309, ppl=4.96, wps=554407, ups=1.12, wpb=496059, bsz=16510.4, num_updates=10700, lr=0.000611418, gnorm=0.24, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=9885 epoch 007: 587 / 1689 loss=3.832, nll_loss=2.309, ppl=4.96, wps=554407, ups=1.12, wpb=496059, bsz=16510.4, num_updates=10700, lr=0.000611418, gnorm=0.24, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=9885 epoch 007: 587 / 1689 loss=3.832, nll_loss=2.309, ppl=4.96, wps=554407, ups=1.12, wpb=496059, bsz=16510.4, num_updates=10700, lr=0.000611418, gnorm=0.24, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=9885 epoch 007: 587 / 1689 loss=3.832, nll_loss=2.309, ppl=4.96, wps=554407, ups=1.12, wpb=496059, bsz=16510.4, num_updates=10700, lr=0.000611418, gnorm=0.24, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=9885 epoch 007: 687 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=554087, ups=1.12, wpb=494694, bsz=16280.5, num_updates=10800, lr=0.000608581, gnorm=0.231, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=9974 epoch 007: 687 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=554087, ups=1.12, wpb=494694, bsz=16280.5, num_updates=10800, lr=0.000608581, gnorm=0.231, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=9974 epoch 007: 687 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=554087, ups=1.12, wpb=494694, bsz=16280.5, num_updates=10800, lr=0.000608581, gnorm=0.231, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=9974 epoch 007: 687 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=554087, ups=1.12, wpb=494694, bsz=16280.5, num_updates=10800, lr=0.000608581, gnorm=0.231, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=9974 epoch 007: 687 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=554087, ups=1.12, wpb=494694, bsz=16280.5, num_updates=10800, lr=0.000608581, gnorm=0.231, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=9974 epoch 007: 687 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=554087, ups=1.12, wpb=494694, bsz=16280.5, num_updates=10800, lr=0.000608581, gnorm=0.231, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=9974 epoch 007: 687 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=554087, ups=1.12, wpb=494694, bsz=16280.5, num_updates=10800, lr=0.000608581, gnorm=0.231, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=9974 epoch 007: 787 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=549747, ups=1.11, wpb=493738, bsz=16337.5, num_updates=10900, lr=0.000605783, gnorm=0.225, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=10064 epoch 007: 787 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=549747, ups=1.11, wpb=493738, bsz=16337.5, num_updates=10900, lr=0.000605783, gnorm=0.225, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=10064 epoch 007: 787 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=549747, ups=1.11, wpb=493738, bsz=16337.5, num_updates=10900, lr=0.000605783, gnorm=0.225, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=10064 epoch 007: 787 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=549747, ups=1.11, wpb=493738, bsz=16337.5, num_updates=10900, lr=0.000605783, gnorm=0.225, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=10064 epoch 007: 787 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=549747, ups=1.11, wpb=493738, bsz=16337.5, num_updates=10900, lr=0.000605783, gnorm=0.225, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=10064 epoch 007: 787 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=549747, ups=1.11, wpb=493738, bsz=16337.5, num_updates=10900, lr=0.000605783, gnorm=0.225, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=10064 epoch 007: 787 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=549747, ups=1.11, wpb=493738, bsz=16337.5, num_updates=10900, lr=0.000605783, gnorm=0.225, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=10064 epoch 007: 887 / 1689 loss=3.83, nll_loss=2.307, ppl=4.95, wps=550903, ups=1.11, wpb=495072, bsz=16254.2, num_updates=11000, lr=0.000603023, gnorm=0.239, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=10154 epoch 007: 887 / 1689 loss=3.83, nll_loss=2.307, ppl=4.95, wps=550903, ups=1.11, wpb=495072, bsz=16254.2, num_updates=11000, lr=0.000603023, gnorm=0.239, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=10154 epoch 007: 887 / 1689 loss=3.83, nll_loss=2.307, ppl=4.95, wps=550903, ups=1.11, wpb=495072, bsz=16254.2, num_updates=11000, lr=0.000603023, gnorm=0.239, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=10154 epoch 007: 887 / 1689 loss=3.83, nll_loss=2.307, ppl=4.95, wps=550903, ups=1.11, wpb=495072, bsz=16254.2, num_updates=11000, lr=0.000603023, gnorm=0.239, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=10154 epoch 007: 887 / 1689 loss=3.83, nll_loss=2.307, ppl=4.95, wps=550903, ups=1.11, wpb=495072, bsz=16254.2, num_updates=11000, lr=0.000603023, gnorm=0.239, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=10154 epoch 007: 887 / 1689 loss=3.83, nll_loss=2.307, ppl=4.95, wps=550903, ups=1.11, wpb=495072, bsz=16254.2, num_updates=11000, lr=0.000603023, gnorm=0.239, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=10154 epoch 007: 887 / 1689 loss=3.83, nll_loss=2.307, ppl=4.95, wps=550903, ups=1.11, wpb=495072, bsz=16254.2, num_updates=11000, lr=0.000603023, gnorm=0.239, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=10154 begin validation on "valid" subset epoch 007 | valid on 'valid' subset | loss 3.814 | nll_loss 2.255 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.814 epoch 007 | valid on 'valid' subset | loss 3.814 | nll_loss 2.255 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.814 epoch 007 | valid on 'valid' subset | loss 3.814 | nll_loss 2.255 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.814 epoch 007 | valid on 'valid' subset | loss 3.814 | nll_loss 2.255 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.814 epoch 007 | valid on 'valid' subset | loss 3.814 | nll_loss 2.255 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.814 epoch 007 | valid on 'valid' subset | loss 3.814 | nll_loss 2.255 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.814 epoch 007 | valid on 'valid' subset | loss 3.814 | nll_loss 2.255 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.814 epoch 007: 987 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=383438, ups=0.77, wpb=496559, bsz=16331, num_updates=11100, lr=0.0006003, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=10283 epoch 007: 987 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=383438, ups=0.77, wpb=496559, bsz=16331, num_updates=11100, lr=0.0006003, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=10283 epoch 007: 987 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=383438, ups=0.77, wpb=496559, bsz=16331, num_updates=11100, lr=0.0006003, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=10283 epoch 007: 987 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=383438, ups=0.77, wpb=496559, bsz=16331, num_updates=11100, lr=0.0006003, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=10283 epoch 007: 987 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=383438, ups=0.77, wpb=496559, bsz=16331, num_updates=11100, lr=0.0006003, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=10283 epoch 007: 987 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=383438, ups=0.77, wpb=496559, bsz=16331, num_updates=11100, lr=0.0006003, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=10283 epoch 007: 987 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=383438, ups=0.77, wpb=496559, bsz=16331, num_updates=11100, lr=0.0006003, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=10283 epoch 007: 1087 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=557694, ups=1.12, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.231, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10372 epoch 007: 1087 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=557694, ups=1.12, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.231, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10372 epoch 007: 1087 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=557694, ups=1.12, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.231, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10372 epoch 007: 1087 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=557694, ups=1.12, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.231, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10372 epoch 007: 1087 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=557694, ups=1.12, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.231, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10372 epoch 007: 1087 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=557694, ups=1.12, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.231, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10372 epoch 007: 1087 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=557694, ups=1.12, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.231, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10372 epoch 007: 1187 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=559766, ups=1.13, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=10461 epoch 007: 1187 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=559766, ups=1.13, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=10461 epoch 007: 1187 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=559766, ups=1.13, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=10461 epoch 007: 1187 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=559766, ups=1.13, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=10461 epoch 007: 1187 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=559766, ups=1.13, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=10461 epoch 007: 1187 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=559766, ups=1.13, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=10461 epoch 007: 1187 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=559766, ups=1.13, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=10461 epoch 007: 1287 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553377, ups=1.12, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=10550 epoch 007: 1287 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553377, ups=1.12, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=10550 epoch 007: 1287 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553377, ups=1.12, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=10550 epoch 007: 1287 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553377, ups=1.12, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=10550 epoch 007: 1287 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553377, ups=1.12, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=10550 epoch 007: 1287 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553377, ups=1.12, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=10550 epoch 007: 1287 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553377, ups=1.12, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=10550 epoch 007: 1387 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=551922, ups=1.12, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=10640 epoch 007: 1387 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=551922, ups=1.12, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=10640 epoch 007: 1387 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=551922, ups=1.12, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=10640 epoch 007: 1387 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=551922, ups=1.12, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=10640 epoch 007: 1387 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=551922, ups=1.12, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=10640 epoch 007: 1387 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=551922, ups=1.12, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=10640 epoch 007: 1387 / 1689 loss=3.832, nll_loss=2.31, ppl=4.96, wps=551922, ups=1.12, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=10640 epoch 007: 1487 / 1689 loss=3.818, nll_loss=2.295, ppl=4.91, wps=552502, ups=1.11, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=10730 epoch 007: 1487 / 1689 loss=3.818, nll_loss=2.295, ppl=4.91, wps=552502, ups=1.11, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=10730 epoch 007: 1487 / 1689 loss=3.818, nll_loss=2.295, ppl=4.91, wps=552502, ups=1.11, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=10730 epoch 007: 1487 / 1689 loss=3.818, nll_loss=2.295, ppl=4.91, wps=552502, ups=1.11, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=10730 epoch 007: 1487 / 1689 loss=3.818, nll_loss=2.295, ppl=4.91, wps=552502, ups=1.11, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=10730 epoch 007: 1487 / 1689 loss=3.818, nll_loss=2.295, ppl=4.91, wps=552502, ups=1.11, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=10730 epoch 007: 1487 / 1689 loss=3.818, nll_loss=2.295, ppl=4.91, wps=552502, ups=1.11, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=10730 epoch 007: 1587 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=553227, ups=1.12, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=10819 epoch 007: 1587 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=553227, ups=1.12, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=10819 epoch 007: 1587 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=553227, ups=1.12, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=10819 epoch 007: 1587 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=553227, ups=1.12, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=10819 epoch 007: 1587 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=553227, ups=1.12, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=10819 epoch 007: 1587 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=553227, ups=1.12, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=10819 epoch 007: 1587 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=553227, ups=1.12, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=10819 epoch 007: 1688 / 1689 loss=3.818, nll_loss=2.294, ppl=4.9, wps=545823, ups=1.1, wpb=494641, bsz=16436.9, num_updates=11800, lr=0.000582223, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10910 epoch 007: 1688 / 1689 loss=3.818, nll_loss=2.294, ppl=4.9, wps=545823, ups=1.1, wpb=494641, bsz=16436.9, num_updates=11800, lr=0.000582223, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10910 epoch 007: 1688 / 1689 loss=3.818, nll_loss=2.294, ppl=4.9, wps=545823, ups=1.1, wpb=494641, bsz=16436.9, num_updates=11800, lr=0.000582223, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10910 epoch 007: 1688 / 1689 loss=3.818, nll_loss=2.294, ppl=4.9, wps=545823, ups=1.1, wpb=494641, bsz=16436.9, num_updates=11800, lr=0.000582223, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10910 epoch 007: 1688 / 1689 loss=3.818, nll_loss=2.294, ppl=4.9, wps=545823, ups=1.1, wpb=494641, bsz=16436.9, num_updates=11800, lr=0.000582223, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10910 epoch 007: 1688 / 1689 loss=3.818, nll_loss=2.294, ppl=4.9, wps=545823, ups=1.1, wpb=494641, bsz=16436.9, num_updates=11800, lr=0.000582223, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10910 epoch 007: 1688 / 1689 loss=3.818, nll_loss=2.294, ppl=4.9, wps=545823, ups=1.1, wpb=494641, bsz=16436.9, num_updates=11800, lr=0.000582223, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=10910 end of epoch 7 (average epoch stats below) epoch 007 | loss 3.828 | nll_loss 2.305 | ppl 4.94 | wps 538113 | ups 1.09 | wpb 495126 | bsz 16503.8 | num_updates 11801 | lr 0.000582198 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 23 | wall 10910 epoch 007 | loss 3.828 | nll_loss 2.305 | ppl 4.94 | wps 538113 | ups 1.09 | wpb 495126 | bsz 16503.8 | num_updates 11801 | lr 0.000582198 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 23 | wall 10910 epoch 007 | loss 3.828 | nll_loss 2.305 | ppl 4.94 | wps 538113 | ups 1.09 | wpb 495126 | bsz 16503.8 | num_updates 11801 | lr 0.000582198 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 23 | wall 10910 epoch 007 | loss 3.828 | nll_loss 2.305 | ppl 4.94 | wps 538113 | ups 1.09 | wpb 495126 | bsz 16503.8 | num_updates 11801 | lr 0.000582198 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 23 | wall 10910 epoch 007 | loss 3.828 | nll_loss 2.305 | ppl 4.94 | wps 538113 | ups 1.09 | wpb 495126 | bsz 16503.8 | num_updates 11801 | lr 0.000582198 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 23 | wall 10910 epoch 007 | loss 3.828 | nll_loss 2.305 | ppl 4.94 | wps 538113 | ups 1.09 | wpb 495126 | bsz 16503.8 | num_updates 11801 | lr 0.000582198 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 23 | wall 10910 epoch 007 | loss 3.828 | nll_loss 2.305 | ppl 4.94 | wps 538113 | ups 1.09 | wpb 495126 | bsz 16503.8 | num_updates 11801 | lr 0.000582198 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 23 | wall 10910 Start iterating over samples epoch 008: 99 / 1689 loss=3.789, nll_loss=2.261, ppl=4.79, wps=522707, ups=1.06, wpb=491873, bsz=16581.4, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=11004 epoch 008: 99 / 1689 loss=3.789, nll_loss=2.261, ppl=4.79, wps=522707, ups=1.06, wpb=491873, bsz=16581.4, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=11004 epoch 008: 99 / 1689 loss=3.789, nll_loss=2.261, ppl=4.79, wps=522707, ups=1.06, wpb=491873, bsz=16581.4, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=11004 epoch 008: 99 / 1689 loss=3.789, nll_loss=2.261, ppl=4.79, wps=522707, ups=1.06, wpb=491873, bsz=16581.4, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=11004 epoch 008: 99 / 1689 loss=3.789, nll_loss=2.261, ppl=4.79, wps=522707, ups=1.06, wpb=491873, bsz=16581.4, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=11004 epoch 008: 99 / 1689 loss=3.789, nll_loss=2.261, ppl=4.79, wps=522707, ups=1.06, wpb=491873, bsz=16581.4, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=11004 epoch 008: 99 / 1689 loss=3.789, nll_loss=2.261, ppl=4.79, wps=522707, ups=1.06, wpb=491873, bsz=16581.4, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=11004 epoch 008: 99 / 1689 loss=3.789, nll_loss=2.261, ppl=4.79, wps=522707, ups=1.06, wpb=491873, bsz=16581.4, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=11004 epoch 008: 199 / 1689 loss=3.792, nll_loss=2.264, ppl=4.8, wps=550798, ups=1.11, wpb=494354, bsz=16661.1, num_updates=12000, lr=0.00057735, gnorm=0.235, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=11094 epoch 008: 199 / 1689 loss=3.792, nll_loss=2.264, ppl=4.8, wps=550798, ups=1.11, wpb=494354, bsz=16661.1, num_updates=12000, lr=0.00057735, gnorm=0.235, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=11094 epoch 008: 199 / 1689 loss=3.792, nll_loss=2.264, ppl=4.8, wps=550798, ups=1.11, wpb=494354, bsz=16661.1, num_updates=12000, lr=0.00057735, gnorm=0.235, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=11094 epoch 008: 199 / 1689 loss=3.792, nll_loss=2.264, ppl=4.8, wps=550798, ups=1.11, wpb=494354, bsz=16661.1, num_updates=12000, lr=0.00057735, gnorm=0.235, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=11094 epoch 008: 199 / 1689 loss=3.792, nll_loss=2.264, ppl=4.8, wps=550798, ups=1.11, wpb=494354, bsz=16661.1, num_updates=12000, lr=0.00057735, gnorm=0.235, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=11094 epoch 008: 199 / 1689 loss=3.792, nll_loss=2.264, ppl=4.8, wps=550798, ups=1.11, wpb=494354, bsz=16661.1, num_updates=12000, lr=0.00057735, gnorm=0.235, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=11094 epoch 008: 199 / 1689 loss=3.792, nll_loss=2.264, ppl=4.8, wps=550798, ups=1.11, wpb=494354, bsz=16661.1, num_updates=12000, lr=0.00057735, gnorm=0.235, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=11094 epoch 008: 199 / 1689 loss=3.792, nll_loss=2.264, ppl=4.8, wps=550798, ups=1.11, wpb=494354, bsz=16661.1, num_updates=12000, lr=0.00057735, gnorm=0.235, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=11094 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 3.807 | nll_loss 2.24 | ppl 4.72 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.807 epoch 008 | valid on 'valid' subset | loss 3.807 | nll_loss 2.24 | ppl 4.72 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.807 epoch 008 | valid on 'valid' subset | loss 3.807 | nll_loss 2.24 | ppl 4.72 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.807 epoch 008 | valid on 'valid' subset | loss 3.807 | nll_loss 2.24 | ppl 4.72 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.807 epoch 008 | valid on 'valid' subset | loss 3.807 | nll_loss 2.24 | ppl 4.72 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.807 epoch 008 | valid on 'valid' subset | loss 3.807 | nll_loss 2.24 | ppl 4.72 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.807 epoch 008 | valid on 'valid' subset | loss 3.807 | nll_loss 2.24 | ppl 4.72 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.807 epoch 008 | valid on 'valid' subset | loss 3.807 | nll_loss 2.24 | ppl 4.72 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.807 epoch 008: 299 / 1689 loss=3.798, nll_loss=2.272, ppl=4.83, wps=459404, ups=0.93, wpb=495513, bsz=16564.1, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=11202 epoch 008: 299 / 1689 loss=3.798, nll_loss=2.272, ppl=4.83, wps=459404, ups=0.93, wpb=495513, bsz=16564.1, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=11202 epoch 008: 299 / 1689 loss=3.798, nll_loss=2.272, ppl=4.83, wps=459404, ups=0.93, wpb=495513, bsz=16564.1, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=11202 epoch 008: 299 / 1689 loss=3.798, nll_loss=2.272, ppl=4.83, wps=459404, ups=0.93, wpb=495513, bsz=16564.1, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=11202 epoch 008: 299 / 1689 loss=3.798, nll_loss=2.272, ppl=4.83, wps=459404, ups=0.93, wpb=495513, bsz=16564.1, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=11202 epoch 008: 299 / 1689 loss=3.798, nll_loss=2.272, ppl=4.83, wps=459404, ups=0.93, wpb=495513, bsz=16564.1, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=11202 epoch 008: 299 / 1689 loss=3.798, nll_loss=2.272, ppl=4.83, wps=459404, ups=0.93, wpb=495513, bsz=16564.1, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=11202 epoch 008: 299 / 1689 loss=3.798, nll_loss=2.272, ppl=4.83, wps=459404, ups=0.93, wpb=495513, bsz=16564.1, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=11202 epoch 008: 399 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=548232, ups=1.11, wpb=494941, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=11292 epoch 008: 399 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=548232, ups=1.11, wpb=494941, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=11292 epoch 008: 399 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=548232, ups=1.11, wpb=494941, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=11292 epoch 008: 399 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=548232, ups=1.11, wpb=494941, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=11292 epoch 008: 399 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=548232, ups=1.11, wpb=494941, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=11292 epoch 008: 399 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=548232, ups=1.11, wpb=494941, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=11292 epoch 008: 399 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=548232, ups=1.11, wpb=494941, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=11292 epoch 008: 399 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=548232, ups=1.11, wpb=494941, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=11292 epoch 008: 499 / 1689 loss=3.804, nll_loss=2.278, ppl=4.85, wps=552610, ups=1.12, wpb=493491, bsz=15965.6, num_updates=12300, lr=0.000570266, gnorm=0.224, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=11381 epoch 008: 499 / 1689 loss=3.804, nll_loss=2.278, ppl=4.85, wps=552610, ups=1.12, wpb=493491, bsz=15965.6, num_updates=12300, lr=0.000570266, gnorm=0.224, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=11381 epoch 008: 499 / 1689 loss=3.804, nll_loss=2.278, ppl=4.85, wps=552610, ups=1.12, wpb=493491, bsz=15965.6, num_updates=12300, lr=0.000570266, gnorm=0.224, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=11381 epoch 008: 499 / 1689 loss=3.804, nll_loss=2.278, ppl=4.85, wps=552610, ups=1.12, wpb=493491, bsz=15965.6, num_updates=12300, lr=0.000570266, gnorm=0.224, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=11381 epoch 008: 499 / 1689 loss=3.804, nll_loss=2.278, ppl=4.85, wps=552610, ups=1.12, wpb=493491, bsz=15965.6, num_updates=12300, lr=0.000570266, gnorm=0.224, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=11381 epoch 008: 499 / 1689 loss=3.804, nll_loss=2.278, ppl=4.85, wps=552610, ups=1.12, wpb=493491, bsz=15965.6, num_updates=12300, lr=0.000570266, gnorm=0.224, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=11381 epoch 008: 499 / 1689 loss=3.804, nll_loss=2.278, ppl=4.85, wps=552610, ups=1.12, wpb=493491, bsz=15965.6, num_updates=12300, lr=0.000570266, gnorm=0.224, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=11381 epoch 008: 499 / 1689 loss=3.804, nll_loss=2.278, ppl=4.85, wps=552610, ups=1.12, wpb=493491, bsz=15965.6, num_updates=12300, lr=0.000570266, gnorm=0.224, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=11381 epoch 008: 600 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=545825, ups=1.1, wpb=495292, bsz=16472.5, num_updates=12400, lr=0.000567962, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=11472 epoch 008: 600 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=545825, ups=1.1, wpb=495292, bsz=16472.5, num_updates=12400, lr=0.000567962, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=11472 epoch 008: 600 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=545825, ups=1.1, wpb=495292, bsz=16472.5, num_updates=12400, lr=0.000567962, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=11472 epoch 008: 600 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=545825, ups=1.1, wpb=495292, bsz=16472.5, num_updates=12400, lr=0.000567962, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=11472 epoch 008: 600 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=545825, ups=1.1, wpb=495292, bsz=16472.5, num_updates=12400, lr=0.000567962, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=11472 epoch 008: 600 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=545825, ups=1.1, wpb=495292, bsz=16472.5, num_updates=12400, lr=0.000567962, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=11472 epoch 008: 600 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=545825, ups=1.1, wpb=495292, bsz=16472.5, num_updates=12400, lr=0.000567962, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=11472 epoch 008: 600 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=545825, ups=1.1, wpb=495292, bsz=16472.5, num_updates=12400, lr=0.000567962, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=11472 epoch 008: 700 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=546448, ups=1.1, wpb=495455, bsz=16568.2, num_updates=12500, lr=0.000565685, gnorm=0.236, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=11563 epoch 008: 700 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=546448, ups=1.1, wpb=495455, bsz=16568.2, num_updates=12500, lr=0.000565685, gnorm=0.236, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=11563 epoch 008: 700 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=546448, ups=1.1, wpb=495455, bsz=16568.2, num_updates=12500, lr=0.000565685, gnorm=0.236, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=11563 epoch 008: 700 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=546448, ups=1.1, wpb=495455, bsz=16568.2, num_updates=12500, lr=0.000565685, gnorm=0.236, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=11563 epoch 008: 700 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=546448, ups=1.1, wpb=495455, bsz=16568.2, num_updates=12500, lr=0.000565685, gnorm=0.236, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=11563 epoch 008: 700 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=546448, ups=1.1, wpb=495455, bsz=16568.2, num_updates=12500, lr=0.000565685, gnorm=0.236, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=11563 epoch 008: 700 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=546448, ups=1.1, wpb=495455, bsz=16568.2, num_updates=12500, lr=0.000565685, gnorm=0.236, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=11563 epoch 008: 700 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=546448, ups=1.1, wpb=495455, bsz=16568.2, num_updates=12500, lr=0.000565685, gnorm=0.236, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=11563 epoch 008: 800 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=547016, ups=1.1, wpb=496138, bsz=16469.1, num_updates=12600, lr=0.000563436, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=11653 epoch 008: 800 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=547016, ups=1.1, wpb=496138, bsz=16469.1, num_updates=12600, lr=0.000563436, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=11653 epoch 008: 800 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=547016, ups=1.1, wpb=496138, bsz=16469.1, num_updates=12600, lr=0.000563436, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=11653 epoch 008: 800 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=547016, ups=1.1, wpb=496138, bsz=16469.1, num_updates=12600, lr=0.000563436, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=11653 epoch 008: 800 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=547016, ups=1.1, wpb=496138, bsz=16469.1, num_updates=12600, lr=0.000563436, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=11653 epoch 008: 800 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=547016, ups=1.1, wpb=496138, bsz=16469.1, num_updates=12600, lr=0.000563436, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=11653 epoch 008: 800 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=547016, ups=1.1, wpb=496138, bsz=16469.1, num_updates=12600, lr=0.000563436, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=11653 epoch 008: 800 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=547016, ups=1.1, wpb=496138, bsz=16469.1, num_updates=12600, lr=0.000563436, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=11653 epoch 008: 900 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=553585, ups=1.12, wpb=495852, bsz=16568.6, num_updates=12700, lr=0.000561214, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=11743 epoch 008: 900 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=553585, ups=1.12, wpb=495852, bsz=16568.6, num_updates=12700, lr=0.000561214, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=11743 epoch 008: 900 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=553585, ups=1.12, wpb=495852, bsz=16568.6, num_updates=12700, lr=0.000561214, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=11743 epoch 008: 900 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=553585, ups=1.12, wpb=495852, bsz=16568.6, num_updates=12700, lr=0.000561214, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=11743 epoch 008: 900 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=553585, ups=1.12, wpb=495852, bsz=16568.6, num_updates=12700, lr=0.000561214, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=11743 epoch 008: 900 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=553585, ups=1.12, wpb=495852, bsz=16568.6, num_updates=12700, lr=0.000561214, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=11743 epoch 008: 900 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=553585, ups=1.12, wpb=495852, bsz=16568.6, num_updates=12700, lr=0.000561214, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=11743 epoch 008: 900 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=553585, ups=1.12, wpb=495852, bsz=16568.6, num_updates=12700, lr=0.000561214, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=11743 epoch 008: 1000 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=547689, ups=1.11, wpb=494070, bsz=16480.7, num_updates=12800, lr=0.000559017, gnorm=0.228, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=11833 epoch 008: 1000 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=547689, ups=1.11, wpb=494070, bsz=16480.7, num_updates=12800, lr=0.000559017, gnorm=0.228, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=11833 epoch 008: 1000 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=547689, ups=1.11, wpb=494070, bsz=16480.7, num_updates=12800, lr=0.000559017, gnorm=0.228, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=11833 epoch 008: 1000 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=547689, ups=1.11, wpb=494070, bsz=16480.7, num_updates=12800, lr=0.000559017, gnorm=0.228, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=11833 epoch 008: 1000 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=547689, ups=1.11, wpb=494070, bsz=16480.7, num_updates=12800, lr=0.000559017, gnorm=0.228, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=11833 epoch 008: 1000 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=547689, ups=1.11, wpb=494070, bsz=16480.7, num_updates=12800, lr=0.000559017, gnorm=0.228, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=11833 epoch 008: 1000 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=547689, ups=1.11, wpb=494070, bsz=16480.7, num_updates=12800, lr=0.000559017, gnorm=0.228, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=11833 epoch 008: 1000 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=547689, ups=1.11, wpb=494070, bsz=16480.7, num_updates=12800, lr=0.000559017, gnorm=0.228, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=11833 epoch 008: 1100 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=551925, ups=1.11, wpb=495237, bsz=16429.4, num_updates=12900, lr=0.000556846, gnorm=0.209, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=11923 epoch 008: 1100 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=551925, ups=1.11, wpb=495237, bsz=16429.4, num_updates=12900, lr=0.000556846, gnorm=0.209, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=11923 epoch 008: 1100 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=551925, ups=1.11, wpb=495237, bsz=16429.4, num_updates=12900, lr=0.000556846, gnorm=0.209, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=11923 epoch 008: 1100 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=551925, ups=1.11, wpb=495237, bsz=16429.4, num_updates=12900, lr=0.000556846, gnorm=0.209, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=11923 epoch 008: 1100 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=551925, ups=1.11, wpb=495237, bsz=16429.4, num_updates=12900, lr=0.000556846, gnorm=0.209, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=11923 epoch 008: 1100 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=551925, ups=1.11, wpb=495237, bsz=16429.4, num_updates=12900, lr=0.000556846, gnorm=0.209, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=11923 epoch 008: 1100 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=551925, ups=1.11, wpb=495237, bsz=16429.4, num_updates=12900, lr=0.000556846, gnorm=0.209, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=11923 epoch 008: 1100 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=551925, ups=1.11, wpb=495237, bsz=16429.4, num_updates=12900, lr=0.000556846, gnorm=0.209, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=11923 epoch 008: 1201 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=549926, ups=1.11, wpb=497470, bsz=16160.6, num_updates=13000, lr=0.0005547, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=12013 epoch 008: 1201 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=549926, ups=1.11, wpb=497470, bsz=16160.6, num_updates=13000, lr=0.0005547, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=12013 epoch 008: 1201 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=549926, ups=1.11, wpb=497470, bsz=16160.6, num_updates=13000, lr=0.0005547, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=12013 epoch 008: 1201 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=549926, ups=1.11, wpb=497470, bsz=16160.6, num_updates=13000, lr=0.0005547, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=12013 epoch 008: 1201 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=549926, ups=1.11, wpb=497470, bsz=16160.6, num_updates=13000, lr=0.0005547, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=12013 epoch 008: 1201 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=549926, ups=1.11, wpb=497470, bsz=16160.6, num_updates=13000, lr=0.0005547, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=12013 epoch 008: 1201 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=549926, ups=1.11, wpb=497470, bsz=16160.6, num_updates=13000, lr=0.0005547, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=12013 epoch 008: 1201 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=549926, ups=1.11, wpb=497470, bsz=16160.6, num_updates=13000, lr=0.0005547, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=12013 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 3.793 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.793 epoch 008 | valid on 'valid' subset | loss 3.793 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.793 epoch 008 | valid on 'valid' subset | loss 3.793 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.793 epoch 008 | valid on 'valid' subset | loss 3.793 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.793 epoch 008 | valid on 'valid' subset | loss 3.793 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.793 epoch 008 | valid on 'valid' subset | loss 3.793 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.793 epoch 008 | valid on 'valid' subset | loss 3.793 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.793 epoch 008 | valid on 'valid' subset | loss 3.793 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.793 epoch 008: 1301 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=450479, ups=0.91, wpb=495492, bsz=16776.3, num_updates=13100, lr=0.000552579, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12123 epoch 008: 1301 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=450479, ups=0.91, wpb=495492, bsz=16776.3, num_updates=13100, lr=0.000552579, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12123 epoch 008: 1301 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=450479, ups=0.91, wpb=495492, bsz=16776.3, num_updates=13100, lr=0.000552579, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12123 epoch 008: 1301 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=450479, ups=0.91, wpb=495492, bsz=16776.3, num_updates=13100, lr=0.000552579, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12123 epoch 008: 1301 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=450479, ups=0.91, wpb=495492, bsz=16776.3, num_updates=13100, lr=0.000552579, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12123 epoch 008: 1301 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=450479, ups=0.91, wpb=495492, bsz=16776.3, num_updates=13100, lr=0.000552579, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12123 epoch 008: 1301 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=450479, ups=0.91, wpb=495492, bsz=16776.3, num_updates=13100, lr=0.000552579, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12123 epoch 008: 1301 / 1689 loss=3.791, nll_loss=2.265, ppl=4.81, wps=450479, ups=0.91, wpb=495492, bsz=16776.3, num_updates=13100, lr=0.000552579, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12123 epoch 008: 1401 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=554252, ups=1.12, wpb=496441, bsz=16561.8, num_updates=13200, lr=0.000550482, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12213 epoch 008: 1401 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=554252, ups=1.12, wpb=496441, bsz=16561.8, num_updates=13200, lr=0.000550482, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12213 epoch 008: 1401 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=554252, ups=1.12, wpb=496441, bsz=16561.8, num_updates=13200, lr=0.000550482, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12213 epoch 008: 1401 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=554252, ups=1.12, wpb=496441, bsz=16561.8, num_updates=13200, lr=0.000550482, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12213 epoch 008: 1401 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=554252, ups=1.12, wpb=496441, bsz=16561.8, num_updates=13200, lr=0.000550482, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12213 epoch 008: 1401 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=554252, ups=1.12, wpb=496441, bsz=16561.8, num_updates=13200, lr=0.000550482, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12213 epoch 008: 1401 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=554252, ups=1.12, wpb=496441, bsz=16561.8, num_updates=13200, lr=0.000550482, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12213 epoch 008: 1401 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=554252, ups=1.12, wpb=496441, bsz=16561.8, num_updates=13200, lr=0.000550482, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=12213 epoch 008: 1501 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=551322, ups=1.11, wpb=494886, bsz=16486.5, num_updates=13300, lr=0.000548408, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=12303 epoch 008: 1501 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=551322, ups=1.11, wpb=494886, bsz=16486.5, num_updates=13300, lr=0.000548408, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=12303 epoch 008: 1501 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=551322, ups=1.11, wpb=494886, bsz=16486.5, num_updates=13300, lr=0.000548408, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=12303 epoch 008: 1501 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=551322, ups=1.11, wpb=494886, bsz=16486.5, num_updates=13300, lr=0.000548408, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=12303 epoch 008: 1501 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=551322, ups=1.11, wpb=494886, bsz=16486.5, num_updates=13300, lr=0.000548408, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=12303 epoch 008: 1501 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=551322, ups=1.11, wpb=494886, bsz=16486.5, num_updates=13300, lr=0.000548408, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=12303 epoch 008: 1501 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=551322, ups=1.11, wpb=494886, bsz=16486.5, num_updates=13300, lr=0.000548408, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=12303 epoch 008: 1501 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=551322, ups=1.11, wpb=494886, bsz=16486.5, num_updates=13300, lr=0.000548408, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=12303 epoch 008: 1601 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550035, ups=1.11, wpb=495036, bsz=16521, num_updates=13400, lr=0.000546358, gnorm=0.233, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=12393 epoch 008: 1601 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550035, ups=1.11, wpb=495036, bsz=16521, num_updates=13400, lr=0.000546358, gnorm=0.233, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=12393 epoch 008: 1601 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550035, ups=1.11, wpb=495036, bsz=16521, num_updates=13400, lr=0.000546358, gnorm=0.233, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=12393 epoch 008: 1601 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550035, ups=1.11, wpb=495036, bsz=16521, num_updates=13400, lr=0.000546358, gnorm=0.233, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=12393 epoch 008: 1601 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550035, ups=1.11, wpb=495036, bsz=16521, num_updates=13400, lr=0.000546358, gnorm=0.233, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=12393 epoch 008: 1601 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550035, ups=1.11, wpb=495036, bsz=16521, num_updates=13400, lr=0.000546358, gnorm=0.233, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=12393 epoch 008: 1601 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550035, ups=1.11, wpb=495036, bsz=16521, num_updates=13400, lr=0.000546358, gnorm=0.233, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=12393 epoch 008: 1601 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550035, ups=1.11, wpb=495036, bsz=16521, num_updates=13400, lr=0.000546358, gnorm=0.233, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=12393 end of epoch 8 (average epoch stats below) epoch 008 | loss 3.797 | nll_loss 2.272 | ppl 4.83 | wps 534994 | ups 1.08 | wpb 495126 | bsz 16506.9 | num_updates 13488 | lr 0.000544573 | gnorm 0.224 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12472 epoch 008 | loss 3.797 | nll_loss 2.272 | ppl 4.83 | wps 534994 | ups 1.08 | wpb 495126 | bsz 16506.9 | num_updates 13488 | lr 0.000544573 | gnorm 0.224 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12472 epoch 008 | loss 3.797 | nll_loss 2.272 | ppl 4.83 | wps 534994 | ups 1.08 | wpb 495126 | bsz 16506.9 | num_updates 13488 | lr 0.000544573 | gnorm 0.224 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12472 epoch 008 | loss 3.797 | nll_loss 2.272 | ppl 4.83 | wps 534994 | ups 1.08 | wpb 495126 | bsz 16506.9 | num_updates 13488 | lr 0.000544573 | gnorm 0.224 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12472 epoch 008 | loss 3.797 | nll_loss 2.272 | ppl 4.83 | wps 534994 | ups 1.08 | wpb 495126 | bsz 16506.9 | num_updates 13488 | lr 0.000544573 | gnorm 0.224 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12472 epoch 008 | loss 3.797 | nll_loss 2.272 | ppl 4.83 | wps 534994 | ups 1.08 | wpb 495126 | bsz 16506.9 | num_updates 13488 | lr 0.000544573 | gnorm 0.224 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12472 epoch 008 | loss 3.797 | nll_loss 2.272 | ppl 4.83 | wps 534994 | ups 1.08 | wpb 495126 | bsz 16506.9 | num_updates 13488 | lr 0.000544573 | gnorm 0.224 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12472 epoch 008 | loss 3.797 | nll_loss 2.272 | ppl 4.83 | wps 534994 | ups 1.08 | wpb 495126 | bsz 16506.9 | num_updates 13488 | lr 0.000544573 | gnorm 0.224 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12472 Start iterating over samples epoch 009: 12 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=544661, ups=1.11, wpb=491583, bsz=16570.7, num_updates=13500, lr=0.000544331, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=12483 epoch 009: 12 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=544661, ups=1.11, wpb=491583, bsz=16570.7, num_updates=13500, lr=0.000544331, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=12483 epoch 009: 12 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=544661, ups=1.11, wpb=491583, bsz=16570.7, num_updates=13500, lr=0.000544331, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=12483 epoch 009: 12 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=544661, ups=1.11, wpb=491583, bsz=16570.7, num_updates=13500, lr=0.000544331, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=12483 epoch 009: 12 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=544661, ups=1.11, wpb=491583, bsz=16570.7, num_updates=13500, lr=0.000544331, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=12483 epoch 009: 12 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=544661, ups=1.11, wpb=491583, bsz=16570.7, num_updates=13500, lr=0.000544331, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=12483 epoch 009: 12 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=544661, ups=1.11, wpb=491583, bsz=16570.7, num_updates=13500, lr=0.000544331, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=12483 epoch 009: 12 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=544661, ups=1.11, wpb=491583, bsz=16570.7, num_updates=13500, lr=0.000544331, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=12483 epoch 009: 12 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=544661, ups=1.11, wpb=491583, bsz=16570.7, num_updates=13500, lr=0.000544331, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=12483 epoch 009: 113 / 1689 loss=3.765, nll_loss=2.235, ppl=4.71, wps=546924, ups=1.1, wpb=496144, bsz=16405.9, num_updates=13600, lr=0.000542326, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=12574 epoch 009: 113 / 1689 loss=3.765, nll_loss=2.235, ppl=4.71, wps=546924, ups=1.1, wpb=496144, bsz=16405.9, num_updates=13600, lr=0.000542326, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=12574 epoch 009: 113 / 1689 loss=3.765, nll_loss=2.235, ppl=4.71, wps=546924, ups=1.1, wpb=496144, bsz=16405.9, num_updates=13600, lr=0.000542326, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=12574 epoch 009: 113 / 1689 loss=3.765, nll_loss=2.235, ppl=4.71, wps=546924, ups=1.1, wpb=496144, bsz=16405.9, num_updates=13600, lr=0.000542326, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=12574 epoch 009: 113 / 1689 loss=3.765, nll_loss=2.235, ppl=4.71, wps=546924, ups=1.1, wpb=496144, bsz=16405.9, num_updates=13600, lr=0.000542326, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=12574 epoch 009: 113 / 1689 loss=3.765, nll_loss=2.235, ppl=4.71, wps=546924, ups=1.1, wpb=496144, bsz=16405.9, num_updates=13600, lr=0.000542326, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=12574 epoch 009: 113 / 1689 loss=3.765, nll_loss=2.235, ppl=4.71, wps=546924, ups=1.1, wpb=496144, bsz=16405.9, num_updates=13600, lr=0.000542326, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=12574 epoch 009: 113 / 1689 loss=3.765, nll_loss=2.235, ppl=4.71, wps=546924, ups=1.1, wpb=496144, bsz=16405.9, num_updates=13600, lr=0.000542326, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=12574 epoch 009: 113 / 1689 loss=3.765, nll_loss=2.235, ppl=4.71, wps=546924, ups=1.1, wpb=496144, bsz=16405.9, num_updates=13600, lr=0.000542326, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=12574 epoch 009: 213 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=548346, ups=1.11, wpb=493635, bsz=16602.7, num_updates=13700, lr=0.000540343, gnorm=0.223, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=12664 epoch 009: 213 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=548346, ups=1.11, wpb=493635, bsz=16602.7, num_updates=13700, lr=0.000540343, gnorm=0.223, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=12664 epoch 009: 213 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=548346, ups=1.11, wpb=493635, bsz=16602.7, num_updates=13700, lr=0.000540343, gnorm=0.223, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=12664 epoch 009: 213 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=548346, ups=1.11, wpb=493635, bsz=16602.7, num_updates=13700, lr=0.000540343, gnorm=0.223, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=12664 epoch 009: 213 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=548346, ups=1.11, wpb=493635, bsz=16602.7, num_updates=13700, lr=0.000540343, gnorm=0.223, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=12664 epoch 009: 213 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=548346, ups=1.11, wpb=493635, bsz=16602.7, num_updates=13700, lr=0.000540343, gnorm=0.223, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=12664 epoch 009: 213 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=548346, ups=1.11, wpb=493635, bsz=16602.7, num_updates=13700, lr=0.000540343, gnorm=0.223, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=12664 epoch 009: 213 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=548346, ups=1.11, wpb=493635, bsz=16602.7, num_updates=13700, lr=0.000540343, gnorm=0.223, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=12664 epoch 009: 213 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=548346, ups=1.11, wpb=493635, bsz=16602.7, num_updates=13700, lr=0.000540343, gnorm=0.223, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=12664 epoch 009: 313 / 1689 loss=3.766, nll_loss=2.237, ppl=4.71, wps=544418, ups=1.1, wpb=497046, bsz=16693.4, num_updates=13800, lr=0.000538382, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=12755 epoch 009: 313 / 1689 loss=3.766, nll_loss=2.237, ppl=4.71, wps=544418, ups=1.1, wpb=497046, bsz=16693.4, num_updates=13800, lr=0.000538382, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=12755 epoch 009: 313 / 1689 loss=3.766, nll_loss=2.237, ppl=4.71, wps=544418, ups=1.1, wpb=497046, bsz=16693.4, num_updates=13800, lr=0.000538382, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=12755 epoch 009: 313 / 1689 loss=3.766, nll_loss=2.237, ppl=4.71, wps=544418, ups=1.1, wpb=497046, bsz=16693.4, num_updates=13800, lr=0.000538382, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=12755 epoch 009: 313 / 1689 loss=3.766, nll_loss=2.237, ppl=4.71, wps=544418, ups=1.1, wpb=497046, bsz=16693.4, num_updates=13800, lr=0.000538382, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=12755 epoch 009: 313 / 1689 loss=3.766, nll_loss=2.237, ppl=4.71, wps=544418, ups=1.1, wpb=497046, bsz=16693.4, num_updates=13800, lr=0.000538382, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=12755 epoch 009: 313 / 1689 loss=3.766, nll_loss=2.237, ppl=4.71, wps=544418, ups=1.1, wpb=497046, bsz=16693.4, num_updates=13800, lr=0.000538382, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=12755 epoch 009: 313 / 1689 loss=3.766, nll_loss=2.237, ppl=4.71, wps=544418, ups=1.1, wpb=497046, bsz=16693.4, num_updates=13800, lr=0.000538382, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=12755 epoch 009: 313 / 1689 loss=3.766, nll_loss=2.237, ppl=4.71, wps=544418, ups=1.1, wpb=497046, bsz=16693.4, num_updates=13800, lr=0.000538382, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=12755 epoch 009: 413 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=546268, ups=1.1, wpb=494844, bsz=16148.6, num_updates=13900, lr=0.000536442, gnorm=0.214, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=12846 epoch 009: 413 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=546268, ups=1.1, wpb=494844, bsz=16148.6, num_updates=13900, lr=0.000536442, gnorm=0.214, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=12846 epoch 009: 413 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=546268, ups=1.1, wpb=494844, bsz=16148.6, num_updates=13900, lr=0.000536442, gnorm=0.214, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=12846 epoch 009: 413 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=546268, ups=1.1, wpb=494844, bsz=16148.6, num_updates=13900, lr=0.000536442, gnorm=0.214, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=12846 epoch 009: 413 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=546268, ups=1.1, wpb=494844, bsz=16148.6, num_updates=13900, lr=0.000536442, gnorm=0.214, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=12846 epoch 009: 413 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=546268, ups=1.1, wpb=494844, bsz=16148.6, num_updates=13900, lr=0.000536442, gnorm=0.214, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=12846 epoch 009: 413 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=546268, ups=1.1, wpb=494844, bsz=16148.6, num_updates=13900, lr=0.000536442, gnorm=0.214, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=12846 epoch 009: 413 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=546268, ups=1.1, wpb=494844, bsz=16148.6, num_updates=13900, lr=0.000536442, gnorm=0.214, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=12846 epoch 009: 413 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=546268, ups=1.1, wpb=494844, bsz=16148.6, num_updates=13900, lr=0.000536442, gnorm=0.214, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=12846 epoch 009: 513 / 1689 loss=3.77, nll_loss=2.241, ppl=4.73, wps=546344, ups=1.1, wpb=495217, bsz=16525.7, num_updates=14000, lr=0.000534522, gnorm=0.215, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=12936 epoch 009: 513 / 1689 loss=3.77, nll_loss=2.241, ppl=4.73, wps=546344, ups=1.1, wpb=495217, bsz=16525.7, num_updates=14000, lr=0.000534522, gnorm=0.215, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=12936 epoch 009: 513 / 1689 loss=3.77, nll_loss=2.241, ppl=4.73, wps=546344, ups=1.1, wpb=495217, bsz=16525.7, num_updates=14000, lr=0.000534522, gnorm=0.215, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=12936 epoch 009: 513 / 1689 loss=3.77, nll_loss=2.241, ppl=4.73, wps=546344, ups=1.1, wpb=495217, bsz=16525.7, num_updates=14000, lr=0.000534522, gnorm=0.215, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=12936 epoch 009: 513 / 1689 loss=3.77, nll_loss=2.241, ppl=4.73, wps=546344, ups=1.1, wpb=495217, bsz=16525.7, num_updates=14000, lr=0.000534522, gnorm=0.215, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=12936 epoch 009: 513 / 1689 loss=3.77, nll_loss=2.241, ppl=4.73, wps=546344, ups=1.1, wpb=495217, bsz=16525.7, num_updates=14000, lr=0.000534522, gnorm=0.215, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=12936 epoch 009: 513 / 1689 loss=3.77, nll_loss=2.241, ppl=4.73, wps=546344, ups=1.1, wpb=495217, bsz=16525.7, num_updates=14000, lr=0.000534522, gnorm=0.215, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=12936 epoch 009: 513 / 1689 loss=3.77, nll_loss=2.241, ppl=4.73, wps=546344, ups=1.1, wpb=495217, bsz=16525.7, num_updates=14000, lr=0.000534522, gnorm=0.215, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=12936 epoch 009: 513 / 1689 loss=3.77, nll_loss=2.241, ppl=4.73, wps=546344, ups=1.1, wpb=495217, bsz=16525.7, num_updates=14000, lr=0.000534522, gnorm=0.215, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=12936 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 3.78 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.78 epoch 009 | valid on 'valid' subset | loss 3.78 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.78 epoch 009 | valid on 'valid' subset | loss 3.78 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.78 epoch 009 | valid on 'valid' subset | loss 3.78 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.78 epoch 009 | valid on 'valid' subset | loss 3.78 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.78 epoch 009 | valid on 'valid' subset | loss 3.78 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.78 epoch 009 | valid on 'valid' subset | loss 3.78 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.78 epoch 009 | valid on 'valid' subset | loss 3.78 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.78 epoch 009 | valid on 'valid' subset | loss 3.78 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.78 epoch 009: 613 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=449001, ups=0.91, wpb=495098, bsz=16381.7, num_updates=14100, lr=0.000532624, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=13047 epoch 009: 613 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=449001, ups=0.91, wpb=495098, bsz=16381.7, num_updates=14100, lr=0.000532624, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=13047 epoch 009: 613 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=449001, ups=0.91, wpb=495098, bsz=16381.7, num_updates=14100, lr=0.000532624, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=13047 epoch 009: 613 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=449001, ups=0.91, wpb=495098, bsz=16381.7, num_updates=14100, lr=0.000532624, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=13047 epoch 009: 613 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=449001, ups=0.91, wpb=495098, bsz=16381.7, num_updates=14100, lr=0.000532624, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=13047 epoch 009: 613 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=449001, ups=0.91, wpb=495098, bsz=16381.7, num_updates=14100, lr=0.000532624, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=13047 epoch 009: 613 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=449001, ups=0.91, wpb=495098, bsz=16381.7, num_updates=14100, lr=0.000532624, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=13047 epoch 009: 613 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=449001, ups=0.91, wpb=495098, bsz=16381.7, num_updates=14100, lr=0.000532624, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=13047 epoch 009: 613 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=449001, ups=0.91, wpb=495098, bsz=16381.7, num_updates=14100, lr=0.000532624, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=13047 epoch 009: 714 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=544442, ups=1.1, wpb=495569, bsz=16518.2, num_updates=14200, lr=0.000530745, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=13138 epoch 009: 714 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=544442, ups=1.1, wpb=495569, bsz=16518.2, num_updates=14200, lr=0.000530745, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=13138 epoch 009: 714 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=544442, ups=1.1, wpb=495569, bsz=16518.2, num_updates=14200, lr=0.000530745, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=13138 epoch 009: 714 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=544442, ups=1.1, wpb=495569, bsz=16518.2, num_updates=14200, lr=0.000530745, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=13138 epoch 009: 714 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=544442, ups=1.1, wpb=495569, bsz=16518.2, num_updates=14200, lr=0.000530745, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=13138 epoch 009: 714 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=544442, ups=1.1, wpb=495569, bsz=16518.2, num_updates=14200, lr=0.000530745, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=13138 epoch 009: 714 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=544442, ups=1.1, wpb=495569, bsz=16518.2, num_updates=14200, lr=0.000530745, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=13138 epoch 009: 714 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=544442, ups=1.1, wpb=495569, bsz=16518.2, num_updates=14200, lr=0.000530745, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=13138 epoch 009: 714 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=544442, ups=1.1, wpb=495569, bsz=16518.2, num_updates=14200, lr=0.000530745, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=13138 epoch 009: 815 / 1689 loss=3.775, nll_loss=2.248, ppl=4.75, wps=545792, ups=1.1, wpb=495273, bsz=16702.6, num_updates=14300, lr=0.000528886, gnorm=0.215, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=13228 epoch 009: 815 / 1689 loss=3.775, nll_loss=2.248, ppl=4.75, wps=545792, ups=1.1, wpb=495273, bsz=16702.6, num_updates=14300, lr=0.000528886, gnorm=0.215, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=13228 epoch 009: 815 / 1689 loss=3.775, nll_loss=2.248, ppl=4.75, wps=545792, ups=1.1, wpb=495273, bsz=16702.6, num_updates=14300, lr=0.000528886, gnorm=0.215, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=13228 epoch 009: 815 / 1689 loss=3.775, nll_loss=2.248, ppl=4.75, wps=545792, ups=1.1, wpb=495273, bsz=16702.6, num_updates=14300, lr=0.000528886, gnorm=0.215, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=13228 epoch 009: 815 / 1689 loss=3.775, nll_loss=2.248, ppl=4.75, wps=545792, ups=1.1, wpb=495273, bsz=16702.6, num_updates=14300, lr=0.000528886, gnorm=0.215, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=13228 epoch 009: 815 / 1689 loss=3.775, nll_loss=2.248, ppl=4.75, wps=545792, ups=1.1, wpb=495273, bsz=16702.6, num_updates=14300, lr=0.000528886, gnorm=0.215, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=13228 epoch 009: 815 / 1689 loss=3.775, nll_loss=2.248, ppl=4.75, wps=545792, ups=1.1, wpb=495273, bsz=16702.6, num_updates=14300, lr=0.000528886, gnorm=0.215, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=13228 epoch 009: 815 / 1689 loss=3.775, nll_loss=2.248, ppl=4.75, wps=545792, ups=1.1, wpb=495273, bsz=16702.6, num_updates=14300, lr=0.000528886, gnorm=0.215, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=13228 epoch 009: 815 / 1689 loss=3.775, nll_loss=2.248, ppl=4.75, wps=545792, ups=1.1, wpb=495273, bsz=16702.6, num_updates=14300, lr=0.000528886, gnorm=0.215, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=13228 epoch 009: 915 / 1689 loss=3.779, nll_loss=2.252, ppl=4.76, wps=549710, ups=1.11, wpb=495424, bsz=16615.2, num_updates=14400, lr=0.000527046, gnorm=0.217, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=13318 epoch 009: 915 / 1689 loss=3.779, nll_loss=2.252, ppl=4.76, wps=549710, ups=1.11, wpb=495424, bsz=16615.2, num_updates=14400, lr=0.000527046, gnorm=0.217, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=13318 epoch 009: 915 / 1689 loss=3.779, nll_loss=2.252, ppl=4.76, wps=549710, ups=1.11, wpb=495424, bsz=16615.2, num_updates=14400, lr=0.000527046, gnorm=0.217, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=13318 epoch 009: 915 / 1689 loss=3.779, nll_loss=2.252, ppl=4.76, wps=549710, ups=1.11, wpb=495424, bsz=16615.2, num_updates=14400, lr=0.000527046, gnorm=0.217, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=13318 epoch 009: 915 / 1689 loss=3.779, nll_loss=2.252, ppl=4.76, wps=549710, ups=1.11, wpb=495424, bsz=16615.2, num_updates=14400, lr=0.000527046, gnorm=0.217, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=13318 epoch 009: 915 / 1689 loss=3.779, nll_loss=2.252, ppl=4.76, wps=549710, ups=1.11, wpb=495424, bsz=16615.2, num_updates=14400, lr=0.000527046, gnorm=0.217, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=13318 epoch 009: 915 / 1689 loss=3.779, nll_loss=2.252, ppl=4.76, wps=549710, ups=1.11, wpb=495424, bsz=16615.2, num_updates=14400, lr=0.000527046, gnorm=0.217, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=13318 epoch 009: 915 / 1689 loss=3.779, nll_loss=2.252, ppl=4.76, wps=549710, ups=1.11, wpb=495424, bsz=16615.2, num_updates=14400, lr=0.000527046, gnorm=0.217, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=13318 epoch 009: 915 / 1689 loss=3.779, nll_loss=2.252, ppl=4.76, wps=549710, ups=1.11, wpb=495424, bsz=16615.2, num_updates=14400, lr=0.000527046, gnorm=0.217, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=13318 epoch 009: 1015 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=560785, ups=1.13, wpb=495011, bsz=16433, num_updates=14500, lr=0.000525226, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=13407 epoch 009: 1015 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=560785, ups=1.13, wpb=495011, bsz=16433, num_updates=14500, lr=0.000525226, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=13407 epoch 009: 1015 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=560785, ups=1.13, wpb=495011, bsz=16433, num_updates=14500, lr=0.000525226, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=13407 epoch 009: 1015 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=560785, ups=1.13, wpb=495011, bsz=16433, num_updates=14500, lr=0.000525226, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=13407 epoch 009: 1015 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=560785, ups=1.13, wpb=495011, bsz=16433, num_updates=14500, lr=0.000525226, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=13407 epoch 009: 1015 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=560785, ups=1.13, wpb=495011, bsz=16433, num_updates=14500, lr=0.000525226, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=13407 epoch 009: 1015 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=560785, ups=1.13, wpb=495011, bsz=16433, num_updates=14500, lr=0.000525226, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=13407 epoch 009: 1015 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=560785, ups=1.13, wpb=495011, bsz=16433, num_updates=14500, lr=0.000525226, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=13407 epoch 009: 1015 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=560785, ups=1.13, wpb=495011, bsz=16433, num_updates=14500, lr=0.000525226, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=13407 epoch 009: 1115 / 1689 loss=3.778, nll_loss=2.251, ppl=4.76, wps=557270, ups=1.13, wpb=495124, bsz=16768.6, num_updates=14600, lr=0.000523424, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=13496 epoch 009: 1115 / 1689 loss=3.778, nll_loss=2.251, ppl=4.76, wps=557270, ups=1.13, wpb=495124, bsz=16768.6, num_updates=14600, lr=0.000523424, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=13496 epoch 009: 1115 / 1689 loss=3.778, nll_loss=2.251, ppl=4.76, wps=557270, ups=1.13, wpb=495124, bsz=16768.6, num_updates=14600, lr=0.000523424, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=13496 epoch 009: 1115 / 1689 loss=3.778, nll_loss=2.251, ppl=4.76, wps=557270, ups=1.13, wpb=495124, bsz=16768.6, num_updates=14600, lr=0.000523424, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=13496 epoch 009: 1115 / 1689 loss=3.778, nll_loss=2.251, ppl=4.76, wps=557270, ups=1.13, wpb=495124, bsz=16768.6, num_updates=14600, lr=0.000523424, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=13496 epoch 009: 1115 / 1689 loss=3.778, nll_loss=2.251, ppl=4.76, wps=557270, ups=1.13, wpb=495124, bsz=16768.6, num_updates=14600, lr=0.000523424, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=13496 epoch 009: 1115 / 1689 loss=3.778, nll_loss=2.251, ppl=4.76, wps=557270, ups=1.13, wpb=495124, bsz=16768.6, num_updates=14600, lr=0.000523424, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=13496 epoch 009: 1115 / 1689 loss=3.778, nll_loss=2.251, ppl=4.76, wps=557270, ups=1.13, wpb=495124, bsz=16768.6, num_updates=14600, lr=0.000523424, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=13496 epoch 009: 1115 / 1689 loss=3.778, nll_loss=2.251, ppl=4.76, wps=557270, ups=1.13, wpb=495124, bsz=16768.6, num_updates=14600, lr=0.000523424, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=13496 epoch 009: 1215 / 1689 loss=3.771, nll_loss=2.243, ppl=4.74, wps=560352, ups=1.13, wpb=495224, bsz=16146.6, num_updates=14700, lr=0.000521641, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=13584 epoch 009: 1215 / 1689 loss=3.771, nll_loss=2.243, ppl=4.74, wps=560352, ups=1.13, wpb=495224, bsz=16146.6, num_updates=14700, lr=0.000521641, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=13584 epoch 009: 1215 / 1689 loss=3.771, nll_loss=2.243, ppl=4.74, wps=560352, ups=1.13, wpb=495224, bsz=16146.6, num_updates=14700, lr=0.000521641, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=13584 epoch 009: 1215 / 1689 loss=3.771, nll_loss=2.243, ppl=4.74, wps=560352, ups=1.13, wpb=495224, bsz=16146.6, num_updates=14700, lr=0.000521641, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=13584 epoch 009: 1215 / 1689 loss=3.771, nll_loss=2.243, ppl=4.74, wps=560352, ups=1.13, wpb=495224, bsz=16146.6, num_updates=14700, lr=0.000521641, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=13584 epoch 009: 1215 / 1689 loss=3.771, nll_loss=2.243, ppl=4.74, wps=560352, ups=1.13, wpb=495224, bsz=16146.6, num_updates=14700, lr=0.000521641, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=13584 epoch 009: 1215 / 1689 loss=3.771, nll_loss=2.243, ppl=4.74, wps=560352, ups=1.13, wpb=495224, bsz=16146.6, num_updates=14700, lr=0.000521641, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=13584 epoch 009: 1215 / 1689 loss=3.771, nll_loss=2.243, ppl=4.74, wps=560352, ups=1.13, wpb=495224, bsz=16146.6, num_updates=14700, lr=0.000521641, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=13584 epoch 009: 1215 / 1689 loss=3.771, nll_loss=2.243, ppl=4.74, wps=560352, ups=1.13, wpb=495224, bsz=16146.6, num_updates=14700, lr=0.000521641, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=13584 epoch 009: 1315 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=558966, ups=1.13, wpb=495555, bsz=16338.8, num_updates=14800, lr=0.000519875, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=13673 epoch 009: 1315 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=558966, ups=1.13, wpb=495555, bsz=16338.8, num_updates=14800, lr=0.000519875, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=13673 epoch 009: 1315 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=558966, ups=1.13, wpb=495555, bsz=16338.8, num_updates=14800, lr=0.000519875, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=13673 epoch 009: 1315 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=558966, ups=1.13, wpb=495555, bsz=16338.8, num_updates=14800, lr=0.000519875, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=13673 epoch 009: 1315 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=558966, ups=1.13, wpb=495555, bsz=16338.8, num_updates=14800, lr=0.000519875, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=13673 epoch 009: 1315 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=558966, ups=1.13, wpb=495555, bsz=16338.8, num_updates=14800, lr=0.000519875, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=13673 epoch 009: 1315 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=558966, ups=1.13, wpb=495555, bsz=16338.8, num_updates=14800, lr=0.000519875, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=13673 epoch 009: 1315 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=558966, ups=1.13, wpb=495555, bsz=16338.8, num_updates=14800, lr=0.000519875, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=13673 epoch 009: 1315 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=558966, ups=1.13, wpb=495555, bsz=16338.8, num_updates=14800, lr=0.000519875, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=13673 epoch 009: 1415 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=555764, ups=1.12, wpb=495142, bsz=16720.5, num_updates=14900, lr=0.000518128, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13762 epoch 009: 1415 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=555764, ups=1.12, wpb=495142, bsz=16720.5, num_updates=14900, lr=0.000518128, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13762 epoch 009: 1415 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=555764, ups=1.12, wpb=495142, bsz=16720.5, num_updates=14900, lr=0.000518128, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13762 epoch 009: 1415 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=555764, ups=1.12, wpb=495142, bsz=16720.5, num_updates=14900, lr=0.000518128, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13762 epoch 009: 1415 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=555764, ups=1.12, wpb=495142, bsz=16720.5, num_updates=14900, lr=0.000518128, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13762 epoch 009: 1415 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=555764, ups=1.12, wpb=495142, bsz=16720.5, num_updates=14900, lr=0.000518128, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13762 epoch 009: 1415 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=555764, ups=1.12, wpb=495142, bsz=16720.5, num_updates=14900, lr=0.000518128, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13762 epoch 009: 1415 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=555764, ups=1.12, wpb=495142, bsz=16720.5, num_updates=14900, lr=0.000518128, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13762 epoch 009: 1415 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=555764, ups=1.12, wpb=495142, bsz=16720.5, num_updates=14900, lr=0.000518128, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13762 epoch 009: 1515 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=555940, ups=1.12, wpb=495860, bsz=16404.5, num_updates=15000, lr=0.000516398, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13851 epoch 009: 1515 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=555940, ups=1.12, wpb=495860, bsz=16404.5, num_updates=15000, lr=0.000516398, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13851 epoch 009: 1515 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=555940, ups=1.12, wpb=495860, bsz=16404.5, num_updates=15000, lr=0.000516398, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13851 epoch 009: 1515 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=555940, ups=1.12, wpb=495860, bsz=16404.5, num_updates=15000, lr=0.000516398, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13851 epoch 009: 1515 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=555940, ups=1.12, wpb=495860, bsz=16404.5, num_updates=15000, lr=0.000516398, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13851 epoch 009: 1515 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=555940, ups=1.12, wpb=495860, bsz=16404.5, num_updates=15000, lr=0.000516398, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13851 epoch 009: 1515 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=555940, ups=1.12, wpb=495860, bsz=16404.5, num_updates=15000, lr=0.000516398, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13851 epoch 009: 1515 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=555940, ups=1.12, wpb=495860, bsz=16404.5, num_updates=15000, lr=0.000516398, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13851 epoch 009: 1515 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=555940, ups=1.12, wpb=495860, bsz=16404.5, num_updates=15000, lr=0.000516398, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13851 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 3.779 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.779 epoch 009 | valid on 'valid' subset | loss 3.779 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.779 epoch 009 | valid on 'valid' subset | loss 3.779 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.779 epoch 009 | valid on 'valid' subset | loss 3.779 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.779 epoch 009 | valid on 'valid' subset | loss 3.779 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.779 epoch 009 | valid on 'valid' subset | loss 3.779 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.779 epoch 009 | valid on 'valid' subset | loss 3.779 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.779 epoch 009 | valid on 'valid' subset | loss 3.779 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.779 epoch 009 | valid on 'valid' subset | loss 3.779 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.779 epoch 009: 1615 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=459911, ups=0.93, wpb=495430, bsz=16689.6, num_updates=15100, lr=0.000514685, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13959 epoch 009: 1615 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=459911, ups=0.93, wpb=495430, bsz=16689.6, num_updates=15100, lr=0.000514685, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13959 epoch 009: 1615 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=459911, ups=0.93, wpb=495430, bsz=16689.6, num_updates=15100, lr=0.000514685, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13959 epoch 009: 1615 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=459911, ups=0.93, wpb=495430, bsz=16689.6, num_updates=15100, lr=0.000514685, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13959 epoch 009: 1615 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=459911, ups=0.93, wpb=495430, bsz=16689.6, num_updates=15100, lr=0.000514685, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13959 epoch 009: 1615 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=459911, ups=0.93, wpb=495430, bsz=16689.6, num_updates=15100, lr=0.000514685, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13959 epoch 009: 1615 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=459911, ups=0.93, wpb=495430, bsz=16689.6, num_updates=15100, lr=0.000514685, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13959 epoch 009: 1615 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=459911, ups=0.93, wpb=495430, bsz=16689.6, num_updates=15100, lr=0.000514685, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13959 epoch 009: 1615 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=459911, ups=0.93, wpb=495430, bsz=16689.6, num_updates=15100, lr=0.000514685, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=13959 end of epoch 9 (average epoch stats below) epoch 009 | loss 3.773 | nll_loss 2.245 | ppl 4.74 | wps 537646 | ups 1.09 | wpb 495150 | bsz 16505.8 | num_updates 15174 | lr 0.000513428 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14024 epoch 009 | loss 3.773 | nll_loss 2.245 | ppl 4.74 | wps 537646 | ups 1.09 | wpb 495150 | bsz 16505.8 | num_updates 15174 | lr 0.000513428 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14024 epoch 009 | loss 3.773 | nll_loss 2.245 | ppl 4.74 | wps 537646 | ups 1.09 | wpb 495150 | bsz 16505.8 | num_updates 15174 | lr 0.000513428 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14024 epoch 009 | loss 3.773 | nll_loss 2.245 | ppl 4.74 | wps 537646 | ups 1.09 | wpb 495150 | bsz 16505.8 | num_updates 15174 | lr 0.000513428 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14024 epoch 009 | loss 3.773 | nll_loss 2.245 | ppl 4.74 | wps 537646 | ups 1.09 | wpb 495150 | bsz 16505.8 | num_updates 15174 | lr 0.000513428 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14024 epoch 009 | loss 3.773 | nll_loss 2.245 | ppl 4.74 | wps 537646 | ups 1.09 | wpb 495150 | bsz 16505.8 | num_updates 15174 | lr 0.000513428 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14024 epoch 009 | loss 3.773 | nll_loss 2.245 | ppl 4.74 | wps 537646 | ups 1.09 | wpb 495150 | bsz 16505.8 | num_updates 15174 | lr 0.000513428 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14024 epoch 009 | loss 3.773 | nll_loss 2.245 | ppl 4.74 | wps 537646 | ups 1.09 | wpb 495150 | bsz 16505.8 | num_updates 15174 | lr 0.000513428 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14024 epoch 009 | loss 3.773 | nll_loss 2.245 | ppl 4.74 | wps 537646 | ups 1.09 | wpb 495150 | bsz 16505.8 | num_updates 15174 | lr 0.000513428 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14024 Start iterating over samples epoch 010: 26 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=540016, ups=1.1, wpb=491847, bsz=16631.2, num_updates=15200, lr=0.000512989, gnorm=0.205, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=14050 epoch 010: 26 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=540016, ups=1.1, wpb=491847, bsz=16631.2, num_updates=15200, lr=0.000512989, gnorm=0.205, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=14050 epoch 010: 26 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=540016, ups=1.1, wpb=491847, bsz=16631.2, num_updates=15200, lr=0.000512989, gnorm=0.205, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=14050 epoch 010: 26 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=540016, ups=1.1, wpb=491847, bsz=16631.2, num_updates=15200, lr=0.000512989, gnorm=0.205, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=14050 epoch 010: 26 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=540016, ups=1.1, wpb=491847, bsz=16631.2, num_updates=15200, lr=0.000512989, gnorm=0.205, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=14050 epoch 010: 26 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=540016, ups=1.1, wpb=491847, bsz=16631.2, num_updates=15200, lr=0.000512989, gnorm=0.205, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=14050 epoch 010: 26 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=540016, ups=1.1, wpb=491847, bsz=16631.2, num_updates=15200, lr=0.000512989, gnorm=0.205, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=14050 epoch 010: 26 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=540016, ups=1.1, wpb=491847, bsz=16631.2, num_updates=15200, lr=0.000512989, gnorm=0.205, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=14050 epoch 010: 26 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=540016, ups=1.1, wpb=491847, bsz=16631.2, num_updates=15200, lr=0.000512989, gnorm=0.205, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=14050 epoch 010: 26 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=540016, ups=1.1, wpb=491847, bsz=16631.2, num_updates=15200, lr=0.000512989, gnorm=0.205, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=14050 epoch 010: 126 / 1689 loss=3.738, nll_loss=2.205, ppl=4.61, wps=554238, ups=1.12, wpb=495206, bsz=16439.5, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=14139 epoch 010: 126 / 1689 loss=3.738, nll_loss=2.205, ppl=4.61, wps=554238, ups=1.12, wpb=495206, bsz=16439.5, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=14139 epoch 010: 126 / 1689 loss=3.738, nll_loss=2.205, ppl=4.61, wps=554238, ups=1.12, wpb=495206, bsz=16439.5, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=14139 epoch 010: 126 / 1689 loss=3.738, nll_loss=2.205, ppl=4.61, wps=554238, ups=1.12, wpb=495206, bsz=16439.5, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=14139 epoch 010: 126 / 1689 loss=3.738, nll_loss=2.205, ppl=4.61, wps=554238, ups=1.12, wpb=495206, bsz=16439.5, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=14139 epoch 010: 126 / 1689 loss=3.738, nll_loss=2.205, ppl=4.61, wps=554238, ups=1.12, wpb=495206, bsz=16439.5, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=14139 epoch 010: 126 / 1689 loss=3.738, nll_loss=2.205, ppl=4.61, wps=554238, ups=1.12, wpb=495206, bsz=16439.5, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=14139 epoch 010: 126 / 1689 loss=3.738, nll_loss=2.205, ppl=4.61, wps=554238, ups=1.12, wpb=495206, bsz=16439.5, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=14139 epoch 010: 126 / 1689 loss=3.738, nll_loss=2.205, ppl=4.61, wps=554238, ups=1.12, wpb=495206, bsz=16439.5, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=14139 epoch 010: 126 / 1689 loss=3.738, nll_loss=2.205, ppl=4.61, wps=554238, ups=1.12, wpb=495206, bsz=16439.5, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=14139 epoch 010: 226 / 1689 loss=3.751, nll_loss=2.22, ppl=4.66, wps=557334, ups=1.12, wpb=495467, bsz=16673.3, num_updates=15400, lr=0.000509647, gnorm=0.208, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=14228 epoch 010: 226 / 1689 loss=3.751, nll_loss=2.22, ppl=4.66, wps=557334, ups=1.12, wpb=495467, bsz=16673.3, num_updates=15400, lr=0.000509647, gnorm=0.208, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=14228 epoch 010: 226 / 1689 loss=3.751, nll_loss=2.22, ppl=4.66, wps=557334, ups=1.12, wpb=495467, bsz=16673.3, num_updates=15400, lr=0.000509647, gnorm=0.208, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=14228 epoch 010: 226 / 1689 loss=3.751, nll_loss=2.22, ppl=4.66, wps=557334, ups=1.12, wpb=495467, bsz=16673.3, num_updates=15400, lr=0.000509647, gnorm=0.208, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=14228 epoch 010: 226 / 1689 loss=3.751, nll_loss=2.22, ppl=4.66, wps=557334, ups=1.12, wpb=495467, bsz=16673.3, num_updates=15400, lr=0.000509647, gnorm=0.208, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=14228 epoch 010: 226 / 1689 loss=3.751, nll_loss=2.22, ppl=4.66, wps=557334, ups=1.12, wpb=495467, bsz=16673.3, num_updates=15400, lr=0.000509647, gnorm=0.208, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=14228 epoch 010: 226 / 1689 loss=3.751, nll_loss=2.22, ppl=4.66, wps=557334, ups=1.12, wpb=495467, bsz=16673.3, num_updates=15400, lr=0.000509647, gnorm=0.208, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=14228 epoch 010: 226 / 1689 loss=3.751, nll_loss=2.22, ppl=4.66, wps=557334, ups=1.12, wpb=495467, bsz=16673.3, num_updates=15400, lr=0.000509647, gnorm=0.208, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=14228 epoch 010: 226 / 1689 loss=3.751, nll_loss=2.22, ppl=4.66, wps=557334, ups=1.12, wpb=495467, bsz=16673.3, num_updates=15400, lr=0.000509647, gnorm=0.208, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=14228 epoch 010: 226 / 1689 loss=3.751, nll_loss=2.22, ppl=4.66, wps=557334, ups=1.12, wpb=495467, bsz=16673.3, num_updates=15400, lr=0.000509647, gnorm=0.208, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=14228 epoch 010: 326 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=556085, ups=1.12, wpb=495484, bsz=16546.3, num_updates=15500, lr=0.000508001, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=14317 epoch 010: 326 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=556085, ups=1.12, wpb=495484, bsz=16546.3, num_updates=15500, lr=0.000508001, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=14317 epoch 010: 326 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=556085, ups=1.12, wpb=495484, bsz=16546.3, num_updates=15500, lr=0.000508001, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=14317 epoch 010: 326 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=556085, ups=1.12, wpb=495484, bsz=16546.3, num_updates=15500, lr=0.000508001, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=14317 epoch 010: 326 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=556085, ups=1.12, wpb=495484, bsz=16546.3, num_updates=15500, lr=0.000508001, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=14317 epoch 010: 326 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=556085, ups=1.12, wpb=495484, bsz=16546.3, num_updates=15500, lr=0.000508001, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=14317 epoch 010: 326 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=556085, ups=1.12, wpb=495484, bsz=16546.3, num_updates=15500, lr=0.000508001, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=14317 epoch 010: 326 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=556085, ups=1.12, wpb=495484, bsz=16546.3, num_updates=15500, lr=0.000508001, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=14317 epoch 010: 326 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=556085, ups=1.12, wpb=495484, bsz=16546.3, num_updates=15500, lr=0.000508001, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=14317 epoch 010: 326 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=556085, ups=1.12, wpb=495484, bsz=16546.3, num_updates=15500, lr=0.000508001, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=14317 epoch 010: 427 / 1689 loss=3.747, nll_loss=2.216, ppl=4.65, wps=549923, ups=1.11, wpb=495658, bsz=16556.9, num_updates=15600, lr=0.00050637, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=14407 epoch 010: 427 / 1689 loss=3.747, nll_loss=2.216, ppl=4.65, wps=549923, ups=1.11, wpb=495658, bsz=16556.9, num_updates=15600, lr=0.00050637, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=14407 epoch 010: 427 / 1689 loss=3.747, nll_loss=2.216, ppl=4.65, wps=549923, ups=1.11, wpb=495658, bsz=16556.9, num_updates=15600, lr=0.00050637, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=14407 epoch 010: 427 / 1689 loss=3.747, nll_loss=2.216, ppl=4.65, wps=549923, ups=1.11, wpb=495658, bsz=16556.9, num_updates=15600, lr=0.00050637, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=14407 epoch 010: 427 / 1689 loss=3.747, nll_loss=2.216, ppl=4.65, wps=549923, ups=1.11, wpb=495658, bsz=16556.9, num_updates=15600, lr=0.00050637, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=14407 epoch 010: 427 / 1689 loss=3.747, nll_loss=2.216, ppl=4.65, wps=549923, ups=1.11, wpb=495658, bsz=16556.9, num_updates=15600, lr=0.00050637, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=14407 epoch 010: 427 / 1689 loss=3.747, nll_loss=2.216, ppl=4.65, wps=549923, ups=1.11, wpb=495658, bsz=16556.9, num_updates=15600, lr=0.00050637, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=14407 epoch 010: 427 / 1689 loss=3.747, nll_loss=2.216, ppl=4.65, wps=549923, ups=1.11, wpb=495658, bsz=16556.9, num_updates=15600, lr=0.00050637, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=14407 epoch 010: 427 / 1689 loss=3.747, nll_loss=2.216, ppl=4.65, wps=549923, ups=1.11, wpb=495658, bsz=16556.9, num_updates=15600, lr=0.00050637, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=14407 epoch 010: 427 / 1689 loss=3.747, nll_loss=2.216, ppl=4.65, wps=549923, ups=1.11, wpb=495658, bsz=16556.9, num_updates=15600, lr=0.00050637, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=14407 epoch 010: 527 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=555381, ups=1.12, wpb=496716, bsz=16339, num_updates=15700, lr=0.000504754, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=19.8, wall=14497 epoch 010: 527 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=555381, ups=1.12, wpb=496716, bsz=16339, num_updates=15700, lr=0.000504754, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=19.8, wall=14497 epoch 010: 527 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=555381, ups=1.12, wpb=496716, bsz=16339, num_updates=15700, lr=0.000504754, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=19.8, wall=14497 epoch 010: 527 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=555381, ups=1.12, wpb=496716, bsz=16339, num_updates=15700, lr=0.000504754, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=19.8, wall=14497 epoch 010: 527 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=555381, ups=1.12, wpb=496716, bsz=16339, num_updates=15700, lr=0.000504754, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=19.8, wall=14497 epoch 010: 527 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=555381, ups=1.12, wpb=496716, bsz=16339, num_updates=15700, lr=0.000504754, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=19.8, wall=14497 epoch 010: 527 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=555381, ups=1.12, wpb=496716, bsz=16339, num_updates=15700, lr=0.000504754, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=19.8, wall=14497 epoch 010: 527 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=555381, ups=1.12, wpb=496716, bsz=16339, num_updates=15700, lr=0.000504754, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=19.8, wall=14497 epoch 010: 527 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=555381, ups=1.12, wpb=496716, bsz=16339, num_updates=15700, lr=0.000504754, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=19.8, wall=14497 epoch 010: 527 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=555381, ups=1.12, wpb=496716, bsz=16339, num_updates=15700, lr=0.000504754, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=19.8, wall=14497 epoch 010: 628 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=541994, ups=1.09, wpb=495488, bsz=16858.2, num_updates=15800, lr=0.000503155, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=14588 epoch 010: 628 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=541994, ups=1.09, wpb=495488, bsz=16858.2, num_updates=15800, lr=0.000503155, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=14588 epoch 010: 628 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=541994, ups=1.09, wpb=495488, bsz=16858.2, num_updates=15800, lr=0.000503155, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=14588 epoch 010: 628 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=541994, ups=1.09, wpb=495488, bsz=16858.2, num_updates=15800, lr=0.000503155, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=14588 epoch 010: 628 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=541994, ups=1.09, wpb=495488, bsz=16858.2, num_updates=15800, lr=0.000503155, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=14588 epoch 010: 628 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=541994, ups=1.09, wpb=495488, bsz=16858.2, num_updates=15800, lr=0.000503155, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=14588 epoch 010: 628 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=541994, ups=1.09, wpb=495488, bsz=16858.2, num_updates=15800, lr=0.000503155, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=14588 epoch 010: 628 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=541994, ups=1.09, wpb=495488, bsz=16858.2, num_updates=15800, lr=0.000503155, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=14588 epoch 010: 628 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=541994, ups=1.09, wpb=495488, bsz=16858.2, num_updates=15800, lr=0.000503155, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=14588 epoch 010: 628 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=541994, ups=1.09, wpb=495488, bsz=16858.2, num_updates=15800, lr=0.000503155, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=14588 epoch 010: 728 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=550849, ups=1.11, wpb=495485, bsz=16544.2, num_updates=15900, lr=0.00050157, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14678 epoch 010: 728 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=550849, ups=1.11, wpb=495485, bsz=16544.2, num_updates=15900, lr=0.00050157, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14678 epoch 010: 728 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=550849, ups=1.11, wpb=495485, bsz=16544.2, num_updates=15900, lr=0.00050157, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14678 epoch 010: 728 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=550849, ups=1.11, wpb=495485, bsz=16544.2, num_updates=15900, lr=0.00050157, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14678 epoch 010: 728 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=550849, ups=1.11, wpb=495485, bsz=16544.2, num_updates=15900, lr=0.00050157, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14678 epoch 010: 728 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=550849, ups=1.11, wpb=495485, bsz=16544.2, num_updates=15900, lr=0.00050157, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14678 epoch 010: 728 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=550849, ups=1.11, wpb=495485, bsz=16544.2, num_updates=15900, lr=0.00050157, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14678 epoch 010: 728 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=550849, ups=1.11, wpb=495485, bsz=16544.2, num_updates=15900, lr=0.00050157, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14678 epoch 010: 728 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=550849, ups=1.11, wpb=495485, bsz=16544.2, num_updates=15900, lr=0.00050157, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14678 epoch 010: 728 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=550849, ups=1.11, wpb=495485, bsz=16544.2, num_updates=15900, lr=0.00050157, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14678 epoch 010: 828 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=551727, ups=1.11, wpb=495899, bsz=16453.5, num_updates=16000, lr=0.0005, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=14768 epoch 010: 828 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=551727, ups=1.11, wpb=495899, bsz=16453.5, num_updates=16000, lr=0.0005, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=14768 epoch 010: 828 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=551727, ups=1.11, wpb=495899, bsz=16453.5, num_updates=16000, lr=0.0005, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=14768 epoch 010: 828 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=551727, ups=1.11, wpb=495899, bsz=16453.5, num_updates=16000, lr=0.0005, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=14768 epoch 010: 828 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=551727, ups=1.11, wpb=495899, bsz=16453.5, num_updates=16000, lr=0.0005, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=14768 epoch 010: 828 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=551727, ups=1.11, wpb=495899, bsz=16453.5, num_updates=16000, lr=0.0005, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=14768 epoch 010: 828 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=551727, ups=1.11, wpb=495899, bsz=16453.5, num_updates=16000, lr=0.0005, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=14768 epoch 010: 828 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=551727, ups=1.11, wpb=495899, bsz=16453.5, num_updates=16000, lr=0.0005, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=14768 epoch 010: 828 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=551727, ups=1.11, wpb=495899, bsz=16453.5, num_updates=16000, lr=0.0005, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=14768 epoch 010: 828 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=551727, ups=1.11, wpb=495899, bsz=16453.5, num_updates=16000, lr=0.0005, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=14768 begin validation on "valid" subset epoch 010 | valid on 'valid' subset | loss 3.762 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.762 epoch 010 | valid on 'valid' subset | loss 3.762 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.762 epoch 010 | valid on 'valid' subset | loss 3.762 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.762 epoch 010 | valid on 'valid' subset | loss 3.762 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.762 epoch 010 | valid on 'valid' subset | loss 3.762 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.762 epoch 010 | valid on 'valid' subset | loss 3.762 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.762 epoch 010 | valid on 'valid' subset | loss 3.762 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.762 epoch 010 | valid on 'valid' subset | loss 3.762 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.762 epoch 010 | valid on 'valid' subset | loss 3.762 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.762 epoch 010 | valid on 'valid' subset | loss 3.762 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.762 epoch 010: 928 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=449672, ups=0.91, wpb=496407, bsz=16754.6, num_updates=16100, lr=0.000498445, gnorm=0.204, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14878 epoch 010: 928 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=449672, ups=0.91, wpb=496407, bsz=16754.6, num_updates=16100, lr=0.000498445, gnorm=0.204, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14878 epoch 010: 928 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=449672, ups=0.91, wpb=496407, bsz=16754.6, num_updates=16100, lr=0.000498445, gnorm=0.204, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14878 epoch 010: 928 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=449672, ups=0.91, wpb=496407, bsz=16754.6, num_updates=16100, lr=0.000498445, gnorm=0.204, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14878 epoch 010: 928 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=449672, ups=0.91, wpb=496407, bsz=16754.6, num_updates=16100, lr=0.000498445, gnorm=0.204, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14878 epoch 010: 928 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=449672, ups=0.91, wpb=496407, bsz=16754.6, num_updates=16100, lr=0.000498445, gnorm=0.204, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14878 epoch 010: 928 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=449672, ups=0.91, wpb=496407, bsz=16754.6, num_updates=16100, lr=0.000498445, gnorm=0.204, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14878 epoch 010: 928 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=449672, ups=0.91, wpb=496407, bsz=16754.6, num_updates=16100, lr=0.000498445, gnorm=0.204, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14878 epoch 010: 928 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=449672, ups=0.91, wpb=496407, bsz=16754.6, num_updates=16100, lr=0.000498445, gnorm=0.204, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14878 epoch 010: 928 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=449672, ups=0.91, wpb=496407, bsz=16754.6, num_updates=16100, lr=0.000498445, gnorm=0.204, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=14878 epoch 010: 1028 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=550769, ups=1.11, wpb=495472, bsz=16565.4, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=14968 epoch 010: 1028 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=550769, ups=1.11, wpb=495472, bsz=16565.4, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=14968 epoch 010: 1028 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=550769, ups=1.11, wpb=495472, bsz=16565.4, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=14968 epoch 010: 1028 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=550769, ups=1.11, wpb=495472, bsz=16565.4, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=14968 epoch 010: 1028 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=550769, ups=1.11, wpb=495472, bsz=16565.4, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=14968 epoch 010: 1028 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=550769, ups=1.11, wpb=495472, bsz=16565.4, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=14968 epoch 010: 1028 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=550769, ups=1.11, wpb=495472, bsz=16565.4, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=14968 epoch 010: 1028 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=550769, ups=1.11, wpb=495472, bsz=16565.4, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=14968 epoch 010: 1028 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=550769, ups=1.11, wpb=495472, bsz=16565.4, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=14968 epoch 010: 1028 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=550769, ups=1.11, wpb=495472, bsz=16565.4, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=14968 epoch 010: 1128 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=553790, ups=1.12, wpb=494671, bsz=16444.6, num_updates=16300, lr=0.000495377, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=15058 epoch 010: 1128 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=553790, ups=1.12, wpb=494671, bsz=16444.6, num_updates=16300, lr=0.000495377, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=15058 epoch 010: 1128 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=553790, ups=1.12, wpb=494671, bsz=16444.6, num_updates=16300, lr=0.000495377, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=15058 epoch 010: 1128 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=553790, ups=1.12, wpb=494671, bsz=16444.6, num_updates=16300, lr=0.000495377, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=15058 epoch 010: 1128 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=553790, ups=1.12, wpb=494671, bsz=16444.6, num_updates=16300, lr=0.000495377, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=15058 epoch 010: 1128 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=553790, ups=1.12, wpb=494671, bsz=16444.6, num_updates=16300, lr=0.000495377, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=15058 epoch 010: 1128 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=553790, ups=1.12, wpb=494671, bsz=16444.6, num_updates=16300, lr=0.000495377, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=15058 epoch 010: 1128 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=553790, ups=1.12, wpb=494671, bsz=16444.6, num_updates=16300, lr=0.000495377, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=15058 epoch 010: 1128 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=553790, ups=1.12, wpb=494671, bsz=16444.6, num_updates=16300, lr=0.000495377, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=15058 epoch 010: 1128 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=553790, ups=1.12, wpb=494671, bsz=16444.6, num_updates=16300, lr=0.000495377, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=15058 epoch 010: 1228 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=550990, ups=1.11, wpb=495342, bsz=16361.8, num_updates=16400, lr=0.000493865, gnorm=0.205, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=15147 epoch 010: 1228 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=550990, ups=1.11, wpb=495342, bsz=16361.8, num_updates=16400, lr=0.000493865, gnorm=0.205, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=15147 epoch 010: 1228 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=550990, ups=1.11, wpb=495342, bsz=16361.8, num_updates=16400, lr=0.000493865, gnorm=0.205, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=15147 epoch 010: 1228 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=550990, ups=1.11, wpb=495342, bsz=16361.8, num_updates=16400, lr=0.000493865, gnorm=0.205, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=15147 epoch 010: 1228 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=550990, ups=1.11, wpb=495342, bsz=16361.8, num_updates=16400, lr=0.000493865, gnorm=0.205, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=15147 epoch 010: 1228 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=550990, ups=1.11, wpb=495342, bsz=16361.8, num_updates=16400, lr=0.000493865, gnorm=0.205, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=15147 epoch 010: 1228 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=550990, ups=1.11, wpb=495342, bsz=16361.8, num_updates=16400, lr=0.000493865, gnorm=0.205, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=15147 epoch 010: 1228 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=550990, ups=1.11, wpb=495342, bsz=16361.8, num_updates=16400, lr=0.000493865, gnorm=0.205, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=15147 epoch 010: 1228 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=550990, ups=1.11, wpb=495342, bsz=16361.8, num_updates=16400, lr=0.000493865, gnorm=0.205, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=15147 epoch 010: 1228 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=550990, ups=1.11, wpb=495342, bsz=16361.8, num_updates=16400, lr=0.000493865, gnorm=0.205, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=15147 epoch 010: 1328 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=551920, ups=1.11, wpb=495076, bsz=16617.7, num_updates=16500, lr=0.000492366, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15237 epoch 010: 1328 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=551920, ups=1.11, wpb=495076, bsz=16617.7, num_updates=16500, lr=0.000492366, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15237 epoch 010: 1328 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=551920, ups=1.11, wpb=495076, bsz=16617.7, num_updates=16500, lr=0.000492366, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15237 epoch 010: 1328 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=551920, ups=1.11, wpb=495076, bsz=16617.7, num_updates=16500, lr=0.000492366, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15237 epoch 010: 1328 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=551920, ups=1.11, wpb=495076, bsz=16617.7, num_updates=16500, lr=0.000492366, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15237 epoch 010: 1328 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=551920, ups=1.11, wpb=495076, bsz=16617.7, num_updates=16500, lr=0.000492366, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15237 epoch 010: 1328 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=551920, ups=1.11, wpb=495076, bsz=16617.7, num_updates=16500, lr=0.000492366, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15237 epoch 010: 1328 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=551920, ups=1.11, wpb=495076, bsz=16617.7, num_updates=16500, lr=0.000492366, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15237 epoch 010: 1328 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=551920, ups=1.11, wpb=495076, bsz=16617.7, num_updates=16500, lr=0.000492366, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15237 epoch 010: 1328 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=551920, ups=1.11, wpb=495076, bsz=16617.7, num_updates=16500, lr=0.000492366, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15237 epoch 010: 1428 / 1689 loss=3.754, nll_loss=2.225, ppl=4.67, wps=548768, ups=1.11, wpb=495594, bsz=16621.3, num_updates=16600, lr=0.000490881, gnorm=0.209, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=15327 epoch 010: 1428 / 1689 loss=3.754, nll_loss=2.225, ppl=4.67, wps=548768, ups=1.11, wpb=495594, bsz=16621.3, num_updates=16600, lr=0.000490881, gnorm=0.209, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=15327 epoch 010: 1428 / 1689 loss=3.754, nll_loss=2.225, ppl=4.67, wps=548768, ups=1.11, wpb=495594, bsz=16621.3, num_updates=16600, lr=0.000490881, gnorm=0.209, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=15327 epoch 010: 1428 / 1689 loss=3.754, nll_loss=2.225, ppl=4.67, wps=548768, ups=1.11, wpb=495594, bsz=16621.3, num_updates=16600, lr=0.000490881, gnorm=0.209, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=15327 epoch 010: 1428 / 1689 loss=3.754, nll_loss=2.225, ppl=4.67, wps=548768, ups=1.11, wpb=495594, bsz=16621.3, num_updates=16600, lr=0.000490881, gnorm=0.209, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=15327 epoch 010: 1428 / 1689 loss=3.754, nll_loss=2.225, ppl=4.67, wps=548768, ups=1.11, wpb=495594, bsz=16621.3, num_updates=16600, lr=0.000490881, gnorm=0.209, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=15327 epoch 010: 1428 / 1689 loss=3.754, nll_loss=2.225, ppl=4.67, wps=548768, ups=1.11, wpb=495594, bsz=16621.3, num_updates=16600, lr=0.000490881, gnorm=0.209, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=15327 epoch 010: 1428 / 1689 loss=3.754, nll_loss=2.225, ppl=4.67, wps=548768, ups=1.11, wpb=495594, bsz=16621.3, num_updates=16600, lr=0.000490881, gnorm=0.209, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=15327 epoch 010: 1428 / 1689 loss=3.754, nll_loss=2.225, ppl=4.67, wps=548768, ups=1.11, wpb=495594, bsz=16621.3, num_updates=16600, lr=0.000490881, gnorm=0.209, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=15327 epoch 010: 1428 / 1689 loss=3.754, nll_loss=2.225, ppl=4.67, wps=548768, ups=1.11, wpb=495594, bsz=16621.3, num_updates=16600, lr=0.000490881, gnorm=0.209, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=15327 epoch 010: 1528 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=549636, ups=1.11, wpb=493506, bsz=16388.7, num_updates=16700, lr=0.000489409, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=15417 epoch 010: 1528 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=549636, ups=1.11, wpb=493506, bsz=16388.7, num_updates=16700, lr=0.000489409, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=15417 epoch 010: 1528 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=549636, ups=1.11, wpb=493506, bsz=16388.7, num_updates=16700, lr=0.000489409, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=15417 epoch 010: 1528 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=549636, ups=1.11, wpb=493506, bsz=16388.7, num_updates=16700, lr=0.000489409, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=15417 epoch 010: 1528 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=549636, ups=1.11, wpb=493506, bsz=16388.7, num_updates=16700, lr=0.000489409, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=15417 epoch 010: 1528 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=549636, ups=1.11, wpb=493506, bsz=16388.7, num_updates=16700, lr=0.000489409, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=15417 epoch 010: 1528 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=549636, ups=1.11, wpb=493506, bsz=16388.7, num_updates=16700, lr=0.000489409, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=15417 epoch 010: 1528 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=549636, ups=1.11, wpb=493506, bsz=16388.7, num_updates=16700, lr=0.000489409, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=15417 epoch 010: 1528 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=549636, ups=1.11, wpb=493506, bsz=16388.7, num_updates=16700, lr=0.000489409, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=15417 epoch 010: 1528 / 1689 loss=3.758, nll_loss=2.23, ppl=4.69, wps=549636, ups=1.11, wpb=493506, bsz=16388.7, num_updates=16700, lr=0.000489409, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=15417 epoch 010: 1629 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=542180, ups=1.09, wpb=495789, bsz=16213.8, num_updates=16800, lr=0.00048795, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.7, wall=15509 epoch 010: 1629 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=542180, ups=1.09, wpb=495789, bsz=16213.8, num_updates=16800, lr=0.00048795, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.7, wall=15509 epoch 010: 1629 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=542180, ups=1.09, wpb=495789, bsz=16213.8, num_updates=16800, lr=0.00048795, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.7, wall=15509 epoch 010: 1629 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=542180, ups=1.09, wpb=495789, bsz=16213.8, num_updates=16800, lr=0.00048795, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.7, wall=15509 epoch 010: 1629 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=542180, ups=1.09, wpb=495789, bsz=16213.8, num_updates=16800, lr=0.00048795, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.7, wall=15509 epoch 010: 1629 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=542180, ups=1.09, wpb=495789, bsz=16213.8, num_updates=16800, lr=0.00048795, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.7, wall=15509 epoch 010: 1629 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=542180, ups=1.09, wpb=495789, bsz=16213.8, num_updates=16800, lr=0.00048795, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.7, wall=15509 epoch 010: 1629 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=542180, ups=1.09, wpb=495789, bsz=16213.8, num_updates=16800, lr=0.00048795, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.7, wall=15509 epoch 010: 1629 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=542180, ups=1.09, wpb=495789, bsz=16213.8, num_updates=16800, lr=0.00048795, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.7, wall=15509 epoch 010: 1629 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=542180, ups=1.09, wpb=495789, bsz=16213.8, num_updates=16800, lr=0.00048795, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.7, wall=15509 end of epoch 10 (average epoch stats below) epoch 010 | loss 3.752 | nll_loss 2.223 | ppl 4.67 | wps 542879 | ups 1.1 | wpb 495136 | bsz 16505.6 | num_updates 16860 | lr 0.000487081 | gnorm 0.207 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.4 | wall 15562 epoch 010 | loss 3.752 | nll_loss 2.223 | ppl 4.67 | wps 542879 | ups 1.1 | wpb 495136 | bsz 16505.6 | num_updates 16860 | lr 0.000487081 | gnorm 0.207 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.4 | wall 15562 epoch 010 | loss 3.752 | nll_loss 2.223 | ppl 4.67 | wps 542879 | ups 1.1 | wpb 495136 | bsz 16505.6 | num_updates 16860 | lr 0.000487081 | gnorm 0.207 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.4 | wall 15562 epoch 010 | loss 3.752 | nll_loss 2.223 | ppl 4.67 | wps 542879 | ups 1.1 | wpb 495136 | bsz 16505.6 | num_updates 16860 | lr 0.000487081 | gnorm 0.207 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.4 | wall 15562 epoch 010 | loss 3.752 | nll_loss 2.223 | ppl 4.67 | wps 542879 | ups 1.1 | wpb 495136 | bsz 16505.6 | num_updates 16860 | lr 0.000487081 | gnorm 0.207 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.4 | wall 15562 epoch 010 | loss 3.752 | nll_loss 2.223 | ppl 4.67 | wps 542879 | ups 1.1 | wpb 495136 | bsz 16505.6 | num_updates 16860 | lr 0.000487081 | gnorm 0.207 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.4 | wall 15562 epoch 010 | loss 3.752 | nll_loss 2.223 | ppl 4.67 | wps 542879 | ups 1.1 | wpb 495136 | bsz 16505.6 | num_updates 16860 | lr 0.000487081 | gnorm 0.207 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.4 | wall 15562 epoch 010 | loss 3.752 | nll_loss 2.223 | ppl 4.67 | wps 542879 | ups 1.1 | wpb 495136 | bsz 16505.6 | num_updates 16860 | lr 0.000487081 | gnorm 0.207 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.4 | wall 15562 epoch 010 | loss 3.752 | nll_loss 2.223 | ppl 4.67 | wps 542879 | ups 1.1 | wpb 495136 | bsz 16505.6 | num_updates 16860 | lr 0.000487081 | gnorm 0.207 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.4 | wall 15562 epoch 010 | loss 3.752 | nll_loss 2.223 | ppl 4.67 | wps 542879 | ups 1.1 | wpb 495136 | bsz 16505.6 | num_updates 16860 | lr 0.000487081 | gnorm 0.207 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.4 | wall 15562 Start iterating over samples epoch 011: 40 / 1689 loss=3.739, nll_loss=2.207, ppl=4.62, wps=545920, ups=1.11, wpb=490717, bsz=16015.3, num_updates=16900, lr=0.000486504, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15599 epoch 011: 40 / 1689 loss=3.739, nll_loss=2.207, ppl=4.62, wps=545920, ups=1.11, wpb=490717, bsz=16015.3, num_updates=16900, lr=0.000486504, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15599 epoch 011: 40 / 1689 loss=3.739, nll_loss=2.207, ppl=4.62, wps=545920, ups=1.11, wpb=490717, bsz=16015.3, num_updates=16900, lr=0.000486504, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15599 epoch 011: 40 / 1689 loss=3.739, nll_loss=2.207, ppl=4.62, wps=545920, ups=1.11, wpb=490717, bsz=16015.3, num_updates=16900, lr=0.000486504, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15599 epoch 011: 40 / 1689 loss=3.739, nll_loss=2.207, ppl=4.62, wps=545920, ups=1.11, wpb=490717, bsz=16015.3, num_updates=16900, lr=0.000486504, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15599 epoch 011: 40 / 1689 loss=3.739, nll_loss=2.207, ppl=4.62, wps=545920, ups=1.11, wpb=490717, bsz=16015.3, num_updates=16900, lr=0.000486504, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15599 epoch 011: 40 / 1689 loss=3.739, nll_loss=2.207, ppl=4.62, wps=545920, ups=1.11, wpb=490717, bsz=16015.3, num_updates=16900, lr=0.000486504, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15599 epoch 011: 40 / 1689 loss=3.739, nll_loss=2.207, ppl=4.62, wps=545920, ups=1.11, wpb=490717, bsz=16015.3, num_updates=16900, lr=0.000486504, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15599 epoch 011: 40 / 1689 loss=3.739, nll_loss=2.207, ppl=4.62, wps=545920, ups=1.11, wpb=490717, bsz=16015.3, num_updates=16900, lr=0.000486504, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15599 epoch 011: 40 / 1689 loss=3.739, nll_loss=2.207, ppl=4.62, wps=545920, ups=1.11, wpb=490717, bsz=16015.3, num_updates=16900, lr=0.000486504, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15599 epoch 011: 40 / 1689 loss=3.739, nll_loss=2.207, ppl=4.62, wps=545920, ups=1.11, wpb=490717, bsz=16015.3, num_updates=16900, lr=0.000486504, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=15599 epoch 011: 140 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=551724, ups=1.11, wpb=496096, bsz=16704.5, num_updates=17000, lr=0.000485071, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=15689 epoch 011: 140 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=551724, ups=1.11, wpb=496096, bsz=16704.5, num_updates=17000, lr=0.000485071, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=15689 epoch 011: 140 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=551724, ups=1.11, wpb=496096, bsz=16704.5, num_updates=17000, lr=0.000485071, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=15689 epoch 011: 140 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=551724, ups=1.11, wpb=496096, bsz=16704.5, num_updates=17000, lr=0.000485071, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=15689 epoch 011: 140 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=551724, ups=1.11, wpb=496096, bsz=16704.5, num_updates=17000, lr=0.000485071, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=15689 epoch 011: 140 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=551724, ups=1.11, wpb=496096, bsz=16704.5, num_updates=17000, lr=0.000485071, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=15689 epoch 011: 140 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=551724, ups=1.11, wpb=496096, bsz=16704.5, num_updates=17000, lr=0.000485071, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=15689 epoch 011: 140 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=551724, ups=1.11, wpb=496096, bsz=16704.5, num_updates=17000, lr=0.000485071, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=15689 epoch 011: 140 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=551724, ups=1.11, wpb=496096, bsz=16704.5, num_updates=17000, lr=0.000485071, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=15689 epoch 011: 140 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=551724, ups=1.11, wpb=496096, bsz=16704.5, num_updates=17000, lr=0.000485071, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=15689 epoch 011: 140 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=551724, ups=1.11, wpb=496096, bsz=16704.5, num_updates=17000, lr=0.000485071, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=15689 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 3.766 | nll_loss 2.198 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.762 epoch 011 | valid on 'valid' subset | loss 3.766 | nll_loss 2.198 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.762 epoch 011 | valid on 'valid' subset | loss 3.766 | nll_loss 2.198 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.762 epoch 011 | valid on 'valid' subset | loss 3.766 | nll_loss 2.198 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.762 epoch 011 | valid on 'valid' subset | loss 3.766 | nll_loss 2.198 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.762 epoch 011 | valid on 'valid' subset | loss 3.766 | nll_loss 2.198 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.762 epoch 011 | valid on 'valid' subset | loss 3.766 | nll_loss 2.198 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.762 epoch 011 | valid on 'valid' subset | loss 3.766 | nll_loss 2.198 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.762 epoch 011 | valid on 'valid' subset | loss 3.766 | nll_loss 2.198 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.762 epoch 011 | valid on 'valid' subset | loss 3.766 | nll_loss 2.198 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.762 epoch 011 | valid on 'valid' subset | loss 3.766 | nll_loss 2.198 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.762 epoch 011: 240 / 1689 loss=3.725, nll_loss=2.192, ppl=4.57, wps=388114, ups=0.78, wpb=495571, bsz=16581, num_updates=17100, lr=0.000483651, gnorm=0.207, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=15816 epoch 011: 240 / 1689 loss=3.725, nll_loss=2.192, ppl=4.57, wps=388114, ups=0.78, wpb=495571, bsz=16581, num_updates=17100, lr=0.000483651, gnorm=0.207, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=15816 epoch 011: 240 / 1689 loss=3.725, nll_loss=2.192, ppl=4.57, wps=388114, ups=0.78, wpb=495571, bsz=16581, num_updates=17100, lr=0.000483651, gnorm=0.207, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=15816 epoch 011: 240 / 1689 loss=3.725, nll_loss=2.192, ppl=4.57, wps=388114, ups=0.78, wpb=495571, bsz=16581, num_updates=17100, lr=0.000483651, gnorm=0.207, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=15816 epoch 011: 240 / 1689 loss=3.725, nll_loss=2.192, ppl=4.57, wps=388114, ups=0.78, wpb=495571, bsz=16581, num_updates=17100, lr=0.000483651, gnorm=0.207, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=15816 epoch 011: 240 / 1689 loss=3.725, nll_loss=2.192, ppl=4.57, wps=388114, ups=0.78, wpb=495571, bsz=16581, num_updates=17100, lr=0.000483651, gnorm=0.207, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=15816 epoch 011: 240 / 1689 loss=3.725, nll_loss=2.192, ppl=4.57, wps=388114, ups=0.78, wpb=495571, bsz=16581, num_updates=17100, lr=0.000483651, gnorm=0.207, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=15816 epoch 011: 240 / 1689 loss=3.725, nll_loss=2.192, ppl=4.57, wps=388114, ups=0.78, wpb=495571, bsz=16581, num_updates=17100, lr=0.000483651, gnorm=0.207, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=15816 epoch 011: 240 / 1689 loss=3.725, nll_loss=2.192, ppl=4.57, wps=388114, ups=0.78, wpb=495571, bsz=16581, num_updates=17100, lr=0.000483651, gnorm=0.207, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=15816 epoch 011: 240 / 1689 loss=3.725, nll_loss=2.192, ppl=4.57, wps=388114, ups=0.78, wpb=495571, bsz=16581, num_updates=17100, lr=0.000483651, gnorm=0.207, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=15816 epoch 011: 240 / 1689 loss=3.725, nll_loss=2.192, ppl=4.57, wps=388114, ups=0.78, wpb=495571, bsz=16581, num_updates=17100, lr=0.000483651, gnorm=0.207, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=15816 epoch 011: 340 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=558598, ups=1.13, wpb=495947, bsz=16990.6, num_updates=17200, lr=0.000482243, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=15905 epoch 011: 340 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=558598, ups=1.13, wpb=495947, bsz=16990.6, num_updates=17200, lr=0.000482243, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=15905 epoch 011: 340 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=558598, ups=1.13, wpb=495947, bsz=16990.6, num_updates=17200, lr=0.000482243, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=15905 epoch 011: 340 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=558598, ups=1.13, wpb=495947, bsz=16990.6, num_updates=17200, lr=0.000482243, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=15905 epoch 011: 340 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=558598, ups=1.13, wpb=495947, bsz=16990.6, num_updates=17200, lr=0.000482243, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=15905 epoch 011: 340 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=558598, ups=1.13, wpb=495947, bsz=16990.6, num_updates=17200, lr=0.000482243, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=15905 epoch 011: 340 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=558598, ups=1.13, wpb=495947, bsz=16990.6, num_updates=17200, lr=0.000482243, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=15905 epoch 011: 340 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=558598, ups=1.13, wpb=495947, bsz=16990.6, num_updates=17200, lr=0.000482243, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=15905 epoch 011: 340 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=558598, ups=1.13, wpb=495947, bsz=16990.6, num_updates=17200, lr=0.000482243, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=15905 epoch 011: 340 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=558598, ups=1.13, wpb=495947, bsz=16990.6, num_updates=17200, lr=0.000482243, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=15905 epoch 011: 340 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=558598, ups=1.13, wpb=495947, bsz=16990.6, num_updates=17200, lr=0.000482243, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=15905 epoch 011: 441 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=547211, ups=1.11, wpb=494229, bsz=16339, num_updates=17300, lr=0.000480847, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=15995 epoch 011: 441 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=547211, ups=1.11, wpb=494229, bsz=16339, num_updates=17300, lr=0.000480847, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=15995 epoch 011: 441 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=547211, ups=1.11, wpb=494229, bsz=16339, num_updates=17300, lr=0.000480847, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=15995 epoch 011: 441 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=547211, ups=1.11, wpb=494229, bsz=16339, num_updates=17300, lr=0.000480847, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=15995 epoch 011: 441 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=547211, ups=1.11, wpb=494229, bsz=16339, num_updates=17300, lr=0.000480847, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=15995 epoch 011: 441 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=547211, ups=1.11, wpb=494229, bsz=16339, num_updates=17300, lr=0.000480847, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=15995 epoch 011: 441 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=547211, ups=1.11, wpb=494229, bsz=16339, num_updates=17300, lr=0.000480847, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=15995 epoch 011: 441 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=547211, ups=1.11, wpb=494229, bsz=16339, num_updates=17300, lr=0.000480847, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=15995 epoch 011: 441 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=547211, ups=1.11, wpb=494229, bsz=16339, num_updates=17300, lr=0.000480847, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=15995 epoch 011: 441 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=547211, ups=1.11, wpb=494229, bsz=16339, num_updates=17300, lr=0.000480847, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=15995 epoch 011: 441 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=547211, ups=1.11, wpb=494229, bsz=16339, num_updates=17300, lr=0.000480847, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=15995 epoch 011: 541 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=552762, ups=1.12, wpb=493705, bsz=16601.8, num_updates=17400, lr=0.000479463, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=16085 epoch 011: 541 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=552762, ups=1.12, wpb=493705, bsz=16601.8, num_updates=17400, lr=0.000479463, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=16085 epoch 011: 541 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=552762, ups=1.12, wpb=493705, bsz=16601.8, num_updates=17400, lr=0.000479463, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=16085 epoch 011: 541 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=552762, ups=1.12, wpb=493705, bsz=16601.8, num_updates=17400, lr=0.000479463, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=16085 epoch 011: 541 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=552762, ups=1.12, wpb=493705, bsz=16601.8, num_updates=17400, lr=0.000479463, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=16085 epoch 011: 541 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=552762, ups=1.12, wpb=493705, bsz=16601.8, num_updates=17400, lr=0.000479463, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=16085 epoch 011: 541 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=552762, ups=1.12, wpb=493705, bsz=16601.8, num_updates=17400, lr=0.000479463, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=16085 epoch 011: 541 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=552762, ups=1.12, wpb=493705, bsz=16601.8, num_updates=17400, lr=0.000479463, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=16085 epoch 011: 541 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=552762, ups=1.12, wpb=493705, bsz=16601.8, num_updates=17400, lr=0.000479463, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=16085 epoch 011: 541 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=552762, ups=1.12, wpb=493705, bsz=16601.8, num_updates=17400, lr=0.000479463, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=16085 epoch 011: 541 / 1689 loss=3.743, nll_loss=2.212, ppl=4.63, wps=552762, ups=1.12, wpb=493705, bsz=16601.8, num_updates=17400, lr=0.000479463, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=16085 epoch 011: 641 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=558452, ups=1.13, wpb=496345, bsz=16232.4, num_updates=17500, lr=0.000478091, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=16174 epoch 011: 641 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=558452, ups=1.13, wpb=496345, bsz=16232.4, num_updates=17500, lr=0.000478091, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=16174 epoch 011: 641 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=558452, ups=1.13, wpb=496345, bsz=16232.4, num_updates=17500, lr=0.000478091, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=16174 epoch 011: 641 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=558452, ups=1.13, wpb=496345, bsz=16232.4, num_updates=17500, lr=0.000478091, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=16174 epoch 011: 641 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=558452, ups=1.13, wpb=496345, bsz=16232.4, num_updates=17500, lr=0.000478091, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=16174 epoch 011: 641 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=558452, ups=1.13, wpb=496345, bsz=16232.4, num_updates=17500, lr=0.000478091, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=16174 epoch 011: 641 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=558452, ups=1.13, wpb=496345, bsz=16232.4, num_updates=17500, lr=0.000478091, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=16174 epoch 011: 641 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=558452, ups=1.13, wpb=496345, bsz=16232.4, num_updates=17500, lr=0.000478091, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=16174 epoch 011: 641 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=558452, ups=1.13, wpb=496345, bsz=16232.4, num_updates=17500, lr=0.000478091, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=16174 epoch 011: 641 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=558452, ups=1.13, wpb=496345, bsz=16232.4, num_updates=17500, lr=0.000478091, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=16174 epoch 011: 641 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=558452, ups=1.13, wpb=496345, bsz=16232.4, num_updates=17500, lr=0.000478091, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=16174 epoch 011: 741 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=549982, ups=1.11, wpb=494835, bsz=16482.3, num_updates=17600, lr=0.000476731, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=16263 epoch 011: 741 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=549982, ups=1.11, wpb=494835, bsz=16482.3, num_updates=17600, lr=0.000476731, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=16263 epoch 011: 741 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=549982, ups=1.11, wpb=494835, bsz=16482.3, num_updates=17600, lr=0.000476731, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=16263 epoch 011: 741 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=549982, ups=1.11, wpb=494835, bsz=16482.3, num_updates=17600, lr=0.000476731, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=16263 epoch 011: 741 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=549982, ups=1.11, wpb=494835, bsz=16482.3, num_updates=17600, lr=0.000476731, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=16263 epoch 011: 741 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=549982, ups=1.11, wpb=494835, bsz=16482.3, num_updates=17600, lr=0.000476731, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=16263 epoch 011: 741 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=549982, ups=1.11, wpb=494835, bsz=16482.3, num_updates=17600, lr=0.000476731, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=16263 epoch 011: 741 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=549982, ups=1.11, wpb=494835, bsz=16482.3, num_updates=17600, lr=0.000476731, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=16263 epoch 011: 741 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=549982, ups=1.11, wpb=494835, bsz=16482.3, num_updates=17600, lr=0.000476731, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=16263 epoch 011: 741 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=549982, ups=1.11, wpb=494835, bsz=16482.3, num_updates=17600, lr=0.000476731, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=16263 epoch 011: 741 / 1689 loss=3.736, nll_loss=2.205, ppl=4.61, wps=549982, ups=1.11, wpb=494835, bsz=16482.3, num_updates=17600, lr=0.000476731, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=16263 epoch 011: 841 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=551781, ups=1.11, wpb=495745, bsz=16697.7, num_updates=17700, lr=0.000475383, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16353 epoch 011: 841 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=551781, ups=1.11, wpb=495745, bsz=16697.7, num_updates=17700, lr=0.000475383, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16353 epoch 011: 841 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=551781, ups=1.11, wpb=495745, bsz=16697.7, num_updates=17700, lr=0.000475383, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16353 epoch 011: 841 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=551781, ups=1.11, wpb=495745, bsz=16697.7, num_updates=17700, lr=0.000475383, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16353 epoch 011: 841 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=551781, ups=1.11, wpb=495745, bsz=16697.7, num_updates=17700, lr=0.000475383, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16353 epoch 011: 841 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=551781, ups=1.11, wpb=495745, bsz=16697.7, num_updates=17700, lr=0.000475383, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16353 epoch 011: 841 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=551781, ups=1.11, wpb=495745, bsz=16697.7, num_updates=17700, lr=0.000475383, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16353 epoch 011: 841 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=551781, ups=1.11, wpb=495745, bsz=16697.7, num_updates=17700, lr=0.000475383, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16353 epoch 011: 841 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=551781, ups=1.11, wpb=495745, bsz=16697.7, num_updates=17700, lr=0.000475383, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16353 epoch 011: 841 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=551781, ups=1.11, wpb=495745, bsz=16697.7, num_updates=17700, lr=0.000475383, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16353 epoch 011: 841 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=551781, ups=1.11, wpb=495745, bsz=16697.7, num_updates=17700, lr=0.000475383, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16353 epoch 011: 941 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=548246, ups=1.1, wpb=496770, bsz=16459.3, num_updates=17800, lr=0.000474045, gnorm=0.197, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=16444 epoch 011: 941 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=548246, ups=1.1, wpb=496770, bsz=16459.3, num_updates=17800, lr=0.000474045, gnorm=0.197, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=16444 epoch 011: 941 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=548246, ups=1.1, wpb=496770, bsz=16459.3, num_updates=17800, lr=0.000474045, gnorm=0.197, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=16444 epoch 011: 941 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=548246, ups=1.1, wpb=496770, bsz=16459.3, num_updates=17800, lr=0.000474045, gnorm=0.197, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=16444 epoch 011: 941 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=548246, ups=1.1, wpb=496770, bsz=16459.3, num_updates=17800, lr=0.000474045, gnorm=0.197, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=16444 epoch 011: 941 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=548246, ups=1.1, wpb=496770, bsz=16459.3, num_updates=17800, lr=0.000474045, gnorm=0.197, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=16444 epoch 011: 941 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=548246, ups=1.1, wpb=496770, bsz=16459.3, num_updates=17800, lr=0.000474045, gnorm=0.197, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=16444 epoch 011: 941 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=548246, ups=1.1, wpb=496770, bsz=16459.3, num_updates=17800, lr=0.000474045, gnorm=0.197, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=16444 epoch 011: 941 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=548246, ups=1.1, wpb=496770, bsz=16459.3, num_updates=17800, lr=0.000474045, gnorm=0.197, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=16444 epoch 011: 941 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=548246, ups=1.1, wpb=496770, bsz=16459.3, num_updates=17800, lr=0.000474045, gnorm=0.197, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=16444 epoch 011: 941 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=548246, ups=1.1, wpb=496770, bsz=16459.3, num_updates=17800, lr=0.000474045, gnorm=0.197, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=16444 epoch 011: 1042 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=536781, ups=1.09, wpb=493426, bsz=16739, num_updates=17900, lr=0.000472719, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=16536 epoch 011: 1042 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=536781, ups=1.09, wpb=493426, bsz=16739, num_updates=17900, lr=0.000472719, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=16536 epoch 011: 1042 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=536781, ups=1.09, wpb=493426, bsz=16739, num_updates=17900, lr=0.000472719, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=16536 epoch 011: 1042 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=536781, ups=1.09, wpb=493426, bsz=16739, num_updates=17900, lr=0.000472719, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=16536 epoch 011: 1042 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=536781, ups=1.09, wpb=493426, bsz=16739, num_updates=17900, lr=0.000472719, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=16536 epoch 011: 1042 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=536781, ups=1.09, wpb=493426, bsz=16739, num_updates=17900, lr=0.000472719, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=16536 epoch 011: 1042 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=536781, ups=1.09, wpb=493426, bsz=16739, num_updates=17900, lr=0.000472719, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=16536 epoch 011: 1042 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=536781, ups=1.09, wpb=493426, bsz=16739, num_updates=17900, lr=0.000472719, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=16536 epoch 011: 1042 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=536781, ups=1.09, wpb=493426, bsz=16739, num_updates=17900, lr=0.000472719, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=16536 epoch 011: 1042 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=536781, ups=1.09, wpb=493426, bsz=16739, num_updates=17900, lr=0.000472719, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=16536 epoch 011: 1042 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=536781, ups=1.09, wpb=493426, bsz=16739, num_updates=17900, lr=0.000472719, gnorm=0.198, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=16536 epoch 011: 1142 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=544213, ups=1.1, wpb=495754, bsz=16661.7, num_updates=18000, lr=0.000471405, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16627 epoch 011: 1142 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=544213, ups=1.1, wpb=495754, bsz=16661.7, num_updates=18000, lr=0.000471405, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16627 epoch 011: 1142 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=544213, ups=1.1, wpb=495754, bsz=16661.7, num_updates=18000, lr=0.000471405, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16627 epoch 011: 1142 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=544213, ups=1.1, wpb=495754, bsz=16661.7, num_updates=18000, lr=0.000471405, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16627 epoch 011: 1142 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=544213, ups=1.1, wpb=495754, bsz=16661.7, num_updates=18000, lr=0.000471405, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16627 epoch 011: 1142 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=544213, ups=1.1, wpb=495754, bsz=16661.7, num_updates=18000, lr=0.000471405, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16627 epoch 011: 1142 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=544213, ups=1.1, wpb=495754, bsz=16661.7, num_updates=18000, lr=0.000471405, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16627 epoch 011: 1142 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=544213, ups=1.1, wpb=495754, bsz=16661.7, num_updates=18000, lr=0.000471405, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16627 epoch 011: 1142 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=544213, ups=1.1, wpb=495754, bsz=16661.7, num_updates=18000, lr=0.000471405, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16627 epoch 011: 1142 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=544213, ups=1.1, wpb=495754, bsz=16661.7, num_updates=18000, lr=0.000471405, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16627 epoch 011: 1142 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=544213, ups=1.1, wpb=495754, bsz=16661.7, num_updates=18000, lr=0.000471405, gnorm=0.211, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=16627 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011: 1242 / 1689 loss=3.734, nll_loss=2.202, ppl=4.6, wps=416992, ups=0.84, wpb=495544, bsz=16730.8, num_updates=18100, lr=0.0004701, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=21.5, wall=16746 epoch 011: 1242 / 1689 loss=3.734, nll_loss=2.202, ppl=4.6, wps=416992, ups=0.84, wpb=495544, bsz=16730.8, num_updates=18100, lr=0.0004701, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=21.5, wall=16746 epoch 011: 1242 / 1689 loss=3.734, nll_loss=2.202, ppl=4.6, wps=416992, ups=0.84, wpb=495544, bsz=16730.8, num_updates=18100, lr=0.0004701, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=21.5, wall=16746 epoch 011: 1242 / 1689 loss=3.734, nll_loss=2.202, ppl=4.6, wps=416992, ups=0.84, wpb=495544, bsz=16730.8, num_updates=18100, lr=0.0004701, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=21.5, wall=16746 epoch 011: 1242 / 1689 loss=3.734, nll_loss=2.202, ppl=4.6, wps=416992, ups=0.84, wpb=495544, bsz=16730.8, num_updates=18100, lr=0.0004701, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=21.5, wall=16746 epoch 011: 1242 / 1689 loss=3.734, nll_loss=2.202, ppl=4.6, wps=416992, ups=0.84, wpb=495544, bsz=16730.8, num_updates=18100, lr=0.0004701, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=21.5, wall=16746 epoch 011: 1242 / 1689 loss=3.734, nll_loss=2.202, ppl=4.6, wps=416992, ups=0.84, wpb=495544, bsz=16730.8, num_updates=18100, lr=0.0004701, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=21.5, wall=16746 epoch 011: 1242 / 1689 loss=3.734, nll_loss=2.202, ppl=4.6, wps=416992, ups=0.84, wpb=495544, bsz=16730.8, num_updates=18100, lr=0.0004701, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=21.5, wall=16746 epoch 011: 1242 / 1689 loss=3.734, nll_loss=2.202, ppl=4.6, wps=416992, ups=0.84, wpb=495544, bsz=16730.8, num_updates=18100, lr=0.0004701, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=21.5, wall=16746 epoch 011: 1242 / 1689 loss=3.734, nll_loss=2.202, ppl=4.6, wps=416992, ups=0.84, wpb=495544, bsz=16730.8, num_updates=18100, lr=0.0004701, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=21.5, wall=16746 epoch 011: 1242 / 1689 loss=3.734, nll_loss=2.202, ppl=4.6, wps=416992, ups=0.84, wpb=495544, bsz=16730.8, num_updates=18100, lr=0.0004701, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=21.5, wall=16746 epoch 011: 1343 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=543020, ups=1.09, wpb=495960, bsz=16376.2, num_updates=18200, lr=0.000468807, gnorm=0.191, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=16837 epoch 011: 1343 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=543020, ups=1.09, wpb=495960, bsz=16376.2, num_updates=18200, lr=0.000468807, gnorm=0.191, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=16837 epoch 011: 1343 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=543020, ups=1.09, wpb=495960, bsz=16376.2, num_updates=18200, lr=0.000468807, gnorm=0.191, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=16837 epoch 011: 1343 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=543020, ups=1.09, wpb=495960, bsz=16376.2, num_updates=18200, lr=0.000468807, gnorm=0.191, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=16837 epoch 011: 1343 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=543020, ups=1.09, wpb=495960, bsz=16376.2, num_updates=18200, lr=0.000468807, gnorm=0.191, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=16837 epoch 011: 1343 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=543020, ups=1.09, wpb=495960, bsz=16376.2, num_updates=18200, lr=0.000468807, gnorm=0.191, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=16837 epoch 011: 1343 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=543020, ups=1.09, wpb=495960, bsz=16376.2, num_updates=18200, lr=0.000468807, gnorm=0.191, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=16837 epoch 011: 1343 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=543020, ups=1.09, wpb=495960, bsz=16376.2, num_updates=18200, lr=0.000468807, gnorm=0.191, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=16837 epoch 011: 1343 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=543020, ups=1.09, wpb=495960, bsz=16376.2, num_updates=18200, lr=0.000468807, gnorm=0.191, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=16837 epoch 011: 1343 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=543020, ups=1.09, wpb=495960, bsz=16376.2, num_updates=18200, lr=0.000468807, gnorm=0.191, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=16837 epoch 011: 1343 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=543020, ups=1.09, wpb=495960, bsz=16376.2, num_updates=18200, lr=0.000468807, gnorm=0.191, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=16837 epoch 011: 1443 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=552664, ups=1.12, wpb=493911, bsz=16134.7, num_updates=18300, lr=0.000467525, gnorm=0.206, clip=0, loss_scale=1, train_wall=87, gb_free=21, wall=16927 epoch 011: 1443 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=552664, ups=1.12, wpb=493911, bsz=16134.7, num_updates=18300, lr=0.000467525, gnorm=0.206, clip=0, loss_scale=1, train_wall=87, gb_free=21, wall=16927 epoch 011: 1443 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=552664, ups=1.12, wpb=493911, bsz=16134.7, num_updates=18300, lr=0.000467525, gnorm=0.206, clip=0, loss_scale=1, train_wall=87, gb_free=21, wall=16927 epoch 011: 1443 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=552664, ups=1.12, wpb=493911, bsz=16134.7, num_updates=18300, lr=0.000467525, gnorm=0.206, clip=0, loss_scale=1, train_wall=87, gb_free=21, wall=16927 epoch 011: 1443 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=552664, ups=1.12, wpb=493911, bsz=16134.7, num_updates=18300, lr=0.000467525, gnorm=0.206, clip=0, loss_scale=1, train_wall=87, gb_free=21, wall=16927 epoch 011: 1443 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=552664, ups=1.12, wpb=493911, bsz=16134.7, num_updates=18300, lr=0.000467525, gnorm=0.206, clip=0, loss_scale=1, train_wall=87, gb_free=21, wall=16927 epoch 011: 1443 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=552664, ups=1.12, wpb=493911, bsz=16134.7, num_updates=18300, lr=0.000467525, gnorm=0.206, clip=0, loss_scale=1, train_wall=87, gb_free=21, wall=16927 epoch 011: 1443 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=552664, ups=1.12, wpb=493911, bsz=16134.7, num_updates=18300, lr=0.000467525, gnorm=0.206, clip=0, loss_scale=1, train_wall=87, gb_free=21, wall=16927 epoch 011: 1443 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=552664, ups=1.12, wpb=493911, bsz=16134.7, num_updates=18300, lr=0.000467525, gnorm=0.206, clip=0, loss_scale=1, train_wall=87, gb_free=21, wall=16927 epoch 011: 1443 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=552664, ups=1.12, wpb=493911, bsz=16134.7, num_updates=18300, lr=0.000467525, gnorm=0.206, clip=0, loss_scale=1, train_wall=87, gb_free=21, wall=16927 epoch 011: 1443 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=552664, ups=1.12, wpb=493911, bsz=16134.7, num_updates=18300, lr=0.000467525, gnorm=0.206, clip=0, loss_scale=1, train_wall=87, gb_free=21, wall=16927 epoch 011: 1543 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=549985, ups=1.11, wpb=494757, bsz=16019, num_updates=18400, lr=0.000466252, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=17016 epoch 011: 1543 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=549985, ups=1.11, wpb=494757, bsz=16019, num_updates=18400, lr=0.000466252, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=17016 epoch 011: 1543 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=549985, ups=1.11, wpb=494757, bsz=16019, num_updates=18400, lr=0.000466252, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=17016 epoch 011: 1543 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=549985, ups=1.11, wpb=494757, bsz=16019, num_updates=18400, lr=0.000466252, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=17016 epoch 011: 1543 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=549985, ups=1.11, wpb=494757, bsz=16019, num_updates=18400, lr=0.000466252, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=17016 epoch 011: 1543 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=549985, ups=1.11, wpb=494757, bsz=16019, num_updates=18400, lr=0.000466252, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=17016 epoch 011: 1543 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=549985, ups=1.11, wpb=494757, bsz=16019, num_updates=18400, lr=0.000466252, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=17016 epoch 011: 1543 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=549985, ups=1.11, wpb=494757, bsz=16019, num_updates=18400, lr=0.000466252, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=17016 epoch 011: 1543 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=549985, ups=1.11, wpb=494757, bsz=16019, num_updates=18400, lr=0.000466252, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=17016 epoch 011: 1543 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=549985, ups=1.11, wpb=494757, bsz=16019, num_updates=18400, lr=0.000466252, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=17016 epoch 011: 1543 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=549985, ups=1.11, wpb=494757, bsz=16019, num_updates=18400, lr=0.000466252, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=17016 epoch 011: 1643 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=555068, ups=1.12, wpb=497092, bsz=16443, num_updates=18500, lr=0.000464991, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=17106 epoch 011: 1643 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=555068, ups=1.12, wpb=497092, bsz=16443, num_updates=18500, lr=0.000464991, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=17106 epoch 011: 1643 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=555068, ups=1.12, wpb=497092, bsz=16443, num_updates=18500, lr=0.000464991, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=17106 epoch 011: 1643 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=555068, ups=1.12, wpb=497092, bsz=16443, num_updates=18500, lr=0.000464991, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=17106 epoch 011: 1643 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=555068, ups=1.12, wpb=497092, bsz=16443, num_updates=18500, lr=0.000464991, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=17106 epoch 011: 1643 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=555068, ups=1.12, wpb=497092, bsz=16443, num_updates=18500, lr=0.000464991, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=17106 epoch 011: 1643 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=555068, ups=1.12, wpb=497092, bsz=16443, num_updates=18500, lr=0.000464991, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=17106 epoch 011: 1643 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=555068, ups=1.12, wpb=497092, bsz=16443, num_updates=18500, lr=0.000464991, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=17106 epoch 011: 1643 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=555068, ups=1.12, wpb=497092, bsz=16443, num_updates=18500, lr=0.000464991, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=17106 epoch 011: 1643 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=555068, ups=1.12, wpb=497092, bsz=16443, num_updates=18500, lr=0.000464991, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=17106 epoch 011: 1643 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=555068, ups=1.12, wpb=497092, bsz=16443, num_updates=18500, lr=0.000464991, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=17106 end of epoch 11 (average epoch stats below) epoch 011 | loss 3.735 | nll_loss 2.204 | ppl 4.61 | wps 526899 | ups 1.06 | wpb 495128 | bsz 16507.4 | num_updates 18546 | lr 0.000464414 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1505 | gb_free 22.7 | wall 17146 epoch 011 | loss 3.735 | nll_loss 2.204 | ppl 4.61 | wps 526899 | ups 1.06 | wpb 495128 | bsz 16507.4 | num_updates 18546 | lr 0.000464414 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1505 | gb_free 22.7 | wall 17146 epoch 011 | loss 3.735 | nll_loss 2.204 | ppl 4.61 | wps 526899 | ups 1.06 | wpb 495128 | bsz 16507.4 | num_updates 18546 | lr 0.000464414 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1505 | gb_free 22.7 | wall 17146 epoch 011 | loss 3.735 | nll_loss 2.204 | ppl 4.61 | wps 526899 | ups 1.06 | wpb 495128 | bsz 16507.4 | num_updates 18546 | lr 0.000464414 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1505 | gb_free 22.7 | wall 17146 epoch 011 | loss 3.735 | nll_loss 2.204 | ppl 4.61 | wps 526899 | ups 1.06 | wpb 495128 | bsz 16507.4 | num_updates 18546 | lr 0.000464414 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1505 | gb_free 22.7 | wall 17146 epoch 011 | loss 3.735 | nll_loss 2.204 | ppl 4.61 | wps 526899 | ups 1.06 | wpb 495128 | bsz 16507.4 | num_updates 18546 | lr 0.000464414 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1505 | gb_free 22.7 | wall 17146 epoch 011 | loss 3.735 | nll_loss 2.204 | ppl 4.61 | wps 526899 | ups 1.06 | wpb 495128 | bsz 16507.4 | num_updates 18546 | lr 0.000464414 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1505 | gb_free 22.7 | wall 17146 epoch 011 | loss 3.735 | nll_loss 2.204 | ppl 4.61 | wps 526899 | ups 1.06 | wpb 495128 | bsz 16507.4 | num_updates 18546 | lr 0.000464414 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1505 | gb_free 22.7 | wall 17146 epoch 011 | loss 3.735 | nll_loss 2.204 | ppl 4.61 | wps 526899 | ups 1.06 | wpb 495128 | bsz 16507.4 | num_updates 18546 | lr 0.000464414 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1505 | gb_free 22.7 | wall 17146 epoch 011 | loss 3.735 | nll_loss 2.204 | ppl 4.61 | wps 526899 | ups 1.06 | wpb 495128 | bsz 16507.4 | num_updates 18546 | lr 0.000464414 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1505 | gb_free 22.7 | wall 17146 epoch 011 | loss 3.735 | nll_loss 2.204 | ppl 4.61 | wps 526899 | ups 1.06 | wpb 495128 | bsz 16507.4 | num_updates 18546 | lr 0.000464414 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1505 | gb_free 22.7 | wall 17146 Start iterating over samples epoch 012: 54 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=543641, ups=1.11, wpb=491026, bsz=16367.2, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=20.2, wall=17196 epoch 012: 54 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=543641, ups=1.11, wpb=491026, bsz=16367.2, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=20.2, wall=17196 epoch 012: 54 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=543641, ups=1.11, wpb=491026, bsz=16367.2, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=20.2, wall=17196 epoch 012: 54 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=543641, ups=1.11, wpb=491026, bsz=16367.2, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=20.2, wall=17196 epoch 012: 54 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=543641, ups=1.11, wpb=491026, bsz=16367.2, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=20.2, wall=17196 epoch 012: 54 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=543641, ups=1.11, wpb=491026, bsz=16367.2, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=20.2, wall=17196 epoch 012: 54 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=543641, ups=1.11, wpb=491026, bsz=16367.2, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=20.2, wall=17196 epoch 012: 54 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=543641, ups=1.11, wpb=491026, bsz=16367.2, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=20.2, wall=17196 epoch 012: 54 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=543641, ups=1.11, wpb=491026, bsz=16367.2, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=20.2, wall=17196 epoch 012: 54 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=543641, ups=1.11, wpb=491026, bsz=16367.2, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=20.2, wall=17196 epoch 012: 54 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=543641, ups=1.11, wpb=491026, bsz=16367.2, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=20.2, wall=17196 epoch 012: 54 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=543641, ups=1.11, wpb=491026, bsz=16367.2, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=20.2, wall=17196 epoch 012: 154 / 1689 loss=3.709, nll_loss=2.174, ppl=4.51, wps=549669, ups=1.11, wpb=496084, bsz=16355.1, num_updates=18700, lr=0.000462497, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=17287 epoch 012: 154 / 1689 loss=3.709, nll_loss=2.174, ppl=4.51, wps=549669, ups=1.11, wpb=496084, bsz=16355.1, num_updates=18700, lr=0.000462497, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=17287 epoch 012: 154 / 1689 loss=3.709, nll_loss=2.174, ppl=4.51, wps=549669, ups=1.11, wpb=496084, bsz=16355.1, num_updates=18700, lr=0.000462497, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=17287 epoch 012: 154 / 1689 loss=3.709, nll_loss=2.174, ppl=4.51, wps=549669, ups=1.11, wpb=496084, bsz=16355.1, num_updates=18700, lr=0.000462497, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=17287 epoch 012: 154 / 1689 loss=3.709, nll_loss=2.174, ppl=4.51, wps=549669, ups=1.11, wpb=496084, bsz=16355.1, num_updates=18700, lr=0.000462497, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=17287 epoch 012: 154 / 1689 loss=3.709, nll_loss=2.174, ppl=4.51, wps=549669, ups=1.11, wpb=496084, bsz=16355.1, num_updates=18700, lr=0.000462497, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=17287 epoch 012: 154 / 1689 loss=3.709, nll_loss=2.174, ppl=4.51, wps=549669, ups=1.11, wpb=496084, bsz=16355.1, num_updates=18700, lr=0.000462497, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=17287 epoch 012: 154 / 1689 loss=3.709, nll_loss=2.174, ppl=4.51, wps=549669, ups=1.11, wpb=496084, bsz=16355.1, num_updates=18700, lr=0.000462497, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=17287 epoch 012: 154 / 1689 loss=3.709, nll_loss=2.174, ppl=4.51, wps=549669, ups=1.11, wpb=496084, bsz=16355.1, num_updates=18700, lr=0.000462497, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=17287 epoch 012: 154 / 1689 loss=3.709, nll_loss=2.174, ppl=4.51, wps=549669, ups=1.11, wpb=496084, bsz=16355.1, num_updates=18700, lr=0.000462497, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=17287 epoch 012: 154 / 1689 loss=3.709, nll_loss=2.174, ppl=4.51, wps=549669, ups=1.11, wpb=496084, bsz=16355.1, num_updates=18700, lr=0.000462497, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=17287 epoch 012: 154 / 1689 loss=3.709, nll_loss=2.174, ppl=4.51, wps=549669, ups=1.11, wpb=496084, bsz=16355.1, num_updates=18700, lr=0.000462497, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=17287 epoch 012: 254 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=553010, ups=1.12, wpb=495021, bsz=16651.3, num_updates=18800, lr=0.000461266, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17376 epoch 012: 254 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=553010, ups=1.12, wpb=495021, bsz=16651.3, num_updates=18800, lr=0.000461266, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17376 epoch 012: 254 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=553010, ups=1.12, wpb=495021, bsz=16651.3, num_updates=18800, lr=0.000461266, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17376 epoch 012: 254 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=553010, ups=1.12, wpb=495021, bsz=16651.3, num_updates=18800, lr=0.000461266, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17376 epoch 012: 254 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=553010, ups=1.12, wpb=495021, bsz=16651.3, num_updates=18800, lr=0.000461266, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17376 epoch 012: 254 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=553010, ups=1.12, wpb=495021, bsz=16651.3, num_updates=18800, lr=0.000461266, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17376 epoch 012: 254 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=553010, ups=1.12, wpb=495021, bsz=16651.3, num_updates=18800, lr=0.000461266, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17376 epoch 012: 254 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=553010, ups=1.12, wpb=495021, bsz=16651.3, num_updates=18800, lr=0.000461266, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17376 epoch 012: 254 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=553010, ups=1.12, wpb=495021, bsz=16651.3, num_updates=18800, lr=0.000461266, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17376 epoch 012: 254 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=553010, ups=1.12, wpb=495021, bsz=16651.3, num_updates=18800, lr=0.000461266, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17376 epoch 012: 254 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=553010, ups=1.12, wpb=495021, bsz=16651.3, num_updates=18800, lr=0.000461266, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17376 epoch 012: 254 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=553010, ups=1.12, wpb=495021, bsz=16651.3, num_updates=18800, lr=0.000461266, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17376 epoch 012: 354 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=550509, ups=1.11, wpb=494065, bsz=16425, num_updates=18900, lr=0.000460044, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=17466 epoch 012: 354 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=550509, ups=1.11, wpb=494065, bsz=16425, num_updates=18900, lr=0.000460044, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=17466 epoch 012: 354 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=550509, ups=1.11, wpb=494065, bsz=16425, num_updates=18900, lr=0.000460044, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=17466 epoch 012: 354 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=550509, ups=1.11, wpb=494065, bsz=16425, num_updates=18900, lr=0.000460044, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=17466 epoch 012: 354 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=550509, ups=1.11, wpb=494065, bsz=16425, num_updates=18900, lr=0.000460044, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=17466 epoch 012: 354 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=550509, ups=1.11, wpb=494065, bsz=16425, num_updates=18900, lr=0.000460044, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=17466 epoch 012: 354 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=550509, ups=1.11, wpb=494065, bsz=16425, num_updates=18900, lr=0.000460044, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=17466 epoch 012: 354 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=550509, ups=1.11, wpb=494065, bsz=16425, num_updates=18900, lr=0.000460044, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=17466 epoch 012: 354 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=550509, ups=1.11, wpb=494065, bsz=16425, num_updates=18900, lr=0.000460044, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=17466 epoch 012: 354 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=550509, ups=1.11, wpb=494065, bsz=16425, num_updates=18900, lr=0.000460044, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=17466 epoch 012: 354 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=550509, ups=1.11, wpb=494065, bsz=16425, num_updates=18900, lr=0.000460044, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=17466 epoch 012: 354 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=550509, ups=1.11, wpb=494065, bsz=16425, num_updates=18900, lr=0.000460044, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=17466 epoch 012: 454 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=555855, ups=1.12, wpb=494486, bsz=16468.2, num_updates=19000, lr=0.000458831, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=17555 epoch 012: 454 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=555855, ups=1.12, wpb=494486, bsz=16468.2, num_updates=19000, lr=0.000458831, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=17555 epoch 012: 454 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=555855, ups=1.12, wpb=494486, bsz=16468.2, num_updates=19000, lr=0.000458831, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=17555 epoch 012: 454 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=555855, ups=1.12, wpb=494486, bsz=16468.2, num_updates=19000, lr=0.000458831, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=17555 epoch 012: 454 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=555855, ups=1.12, wpb=494486, bsz=16468.2, num_updates=19000, lr=0.000458831, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=17555 epoch 012: 454 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=555855, ups=1.12, wpb=494486, bsz=16468.2, num_updates=19000, lr=0.000458831, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=17555 epoch 012: 454 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=555855, ups=1.12, wpb=494486, bsz=16468.2, num_updates=19000, lr=0.000458831, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=17555 epoch 012: 454 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=555855, ups=1.12, wpb=494486, bsz=16468.2, num_updates=19000, lr=0.000458831, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=17555 epoch 012: 454 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=555855, ups=1.12, wpb=494486, bsz=16468.2, num_updates=19000, lr=0.000458831, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=17555 epoch 012: 454 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=555855, ups=1.12, wpb=494486, bsz=16468.2, num_updates=19000, lr=0.000458831, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=17555 epoch 012: 454 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=555855, ups=1.12, wpb=494486, bsz=16468.2, num_updates=19000, lr=0.000458831, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=17555 epoch 012: 454 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=555855, ups=1.12, wpb=494486, bsz=16468.2, num_updates=19000, lr=0.000458831, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=17555 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 3.75 | nll_loss 2.188 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.75 epoch 012 | valid on 'valid' subset | loss 3.75 | nll_loss 2.188 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.75 epoch 012 | valid on 'valid' subset | loss 3.75 | nll_loss 2.188 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.75 epoch 012 | valid on 'valid' subset | loss 3.75 | nll_loss 2.188 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.75 epoch 012 | valid on 'valid' subset | loss 3.75 | nll_loss 2.188 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.75 epoch 012 | valid on 'valid' subset | loss 3.75 | nll_loss 2.188 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.75 epoch 012 | valid on 'valid' subset | loss 3.75 | nll_loss 2.188 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.75 epoch 012 | valid on 'valid' subset | loss 3.75 | nll_loss 2.188 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.75 epoch 012 | valid on 'valid' subset | loss 3.75 | nll_loss 2.188 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.75 epoch 012 | valid on 'valid' subset | loss 3.75 | nll_loss 2.188 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.75 epoch 012 | valid on 'valid' subset | loss 3.75 | nll_loss 2.188 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.75 epoch 012 | valid on 'valid' subset | loss 3.75 | nll_loss 2.188 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.75 epoch 012: 554 / 1689 loss=3.719, nll_loss=2.185, ppl=4.55, wps=449001, ups=0.91, wpb=495919, bsz=16426.8, num_updates=19100, lr=0.000457629, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=17665 epoch 012: 554 / 1689 loss=3.719, nll_loss=2.185, ppl=4.55, wps=449001, ups=0.91, wpb=495919, bsz=16426.8, num_updates=19100, lr=0.000457629, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=17665 epoch 012: 554 / 1689 loss=3.719, nll_loss=2.185, ppl=4.55, wps=449001, ups=0.91, wpb=495919, bsz=16426.8, num_updates=19100, lr=0.000457629, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=17665 epoch 012: 554 / 1689 loss=3.719, nll_loss=2.185, ppl=4.55, wps=449001, ups=0.91, wpb=495919, bsz=16426.8, num_updates=19100, lr=0.000457629, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=17665 epoch 012: 554 / 1689 loss=3.719, nll_loss=2.185, ppl=4.55, wps=449001, ups=0.91, wpb=495919, bsz=16426.8, num_updates=19100, lr=0.000457629, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=17665 epoch 012: 554 / 1689 loss=3.719, nll_loss=2.185, ppl=4.55, wps=449001, ups=0.91, wpb=495919, bsz=16426.8, num_updates=19100, lr=0.000457629, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=17665 epoch 012: 554 / 1689 loss=3.719, nll_loss=2.185, ppl=4.55, wps=449001, ups=0.91, wpb=495919, bsz=16426.8, num_updates=19100, lr=0.000457629, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=17665 epoch 012: 554 / 1689 loss=3.719, nll_loss=2.185, ppl=4.55, wps=449001, ups=0.91, wpb=495919, bsz=16426.8, num_updates=19100, lr=0.000457629, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=17665 epoch 012: 554 / 1689 loss=3.719, nll_loss=2.185, ppl=4.55, wps=449001, ups=0.91, wpb=495919, bsz=16426.8, num_updates=19100, lr=0.000457629, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=17665 epoch 012: 554 / 1689 loss=3.719, nll_loss=2.185, ppl=4.55, wps=449001, ups=0.91, wpb=495919, bsz=16426.8, num_updates=19100, lr=0.000457629, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=17665 epoch 012: 554 / 1689 loss=3.719, nll_loss=2.185, ppl=4.55, wps=449001, ups=0.91, wpb=495919, bsz=16426.8, num_updates=19100, lr=0.000457629, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=17665 epoch 012: 554 / 1689 loss=3.719, nll_loss=2.185, ppl=4.55, wps=449001, ups=0.91, wpb=495919, bsz=16426.8, num_updates=19100, lr=0.000457629, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=17665 epoch 012: 654 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=556915, ups=1.12, wpb=496805, bsz=16748.5, num_updates=19200, lr=0.000456435, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=17755 epoch 012: 654 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=556915, ups=1.12, wpb=496805, bsz=16748.5, num_updates=19200, lr=0.000456435, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=17755 epoch 012: 654 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=556915, ups=1.12, wpb=496805, bsz=16748.5, num_updates=19200, lr=0.000456435, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=17755 epoch 012: 654 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=556915, ups=1.12, wpb=496805, bsz=16748.5, num_updates=19200, lr=0.000456435, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=17755 epoch 012: 654 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=556915, ups=1.12, wpb=496805, bsz=16748.5, num_updates=19200, lr=0.000456435, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=17755 epoch 012: 654 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=556915, ups=1.12, wpb=496805, bsz=16748.5, num_updates=19200, lr=0.000456435, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=17755 epoch 012: 654 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=556915, ups=1.12, wpb=496805, bsz=16748.5, num_updates=19200, lr=0.000456435, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=17755 epoch 012: 654 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=556915, ups=1.12, wpb=496805, bsz=16748.5, num_updates=19200, lr=0.000456435, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=17755 epoch 012: 654 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=556915, ups=1.12, wpb=496805, bsz=16748.5, num_updates=19200, lr=0.000456435, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=17755 epoch 012: 654 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=556915, ups=1.12, wpb=496805, bsz=16748.5, num_updates=19200, lr=0.000456435, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=17755 epoch 012: 654 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=556915, ups=1.12, wpb=496805, bsz=16748.5, num_updates=19200, lr=0.000456435, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=17755 epoch 012: 654 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=556915, ups=1.12, wpb=496805, bsz=16748.5, num_updates=19200, lr=0.000456435, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=17755 epoch 012: 755 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=546638, ups=1.1, wpb=495857, bsz=16421.1, num_updates=19300, lr=0.000455251, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17845 epoch 012: 755 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=546638, ups=1.1, wpb=495857, bsz=16421.1, num_updates=19300, lr=0.000455251, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17845 epoch 012: 755 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=546638, ups=1.1, wpb=495857, bsz=16421.1, num_updates=19300, lr=0.000455251, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17845 epoch 012: 755 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=546638, ups=1.1, wpb=495857, bsz=16421.1, num_updates=19300, lr=0.000455251, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17845 epoch 012: 755 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=546638, ups=1.1, wpb=495857, bsz=16421.1, num_updates=19300, lr=0.000455251, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17845 epoch 012: 755 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=546638, ups=1.1, wpb=495857, bsz=16421.1, num_updates=19300, lr=0.000455251, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17845 epoch 012: 755 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=546638, ups=1.1, wpb=495857, bsz=16421.1, num_updates=19300, lr=0.000455251, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17845 epoch 012: 755 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=546638, ups=1.1, wpb=495857, bsz=16421.1, num_updates=19300, lr=0.000455251, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17845 epoch 012: 755 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=546638, ups=1.1, wpb=495857, bsz=16421.1, num_updates=19300, lr=0.000455251, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17845 epoch 012: 755 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=546638, ups=1.1, wpb=495857, bsz=16421.1, num_updates=19300, lr=0.000455251, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17845 epoch 012: 755 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=546638, ups=1.1, wpb=495857, bsz=16421.1, num_updates=19300, lr=0.000455251, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17845 epoch 012: 755 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=546638, ups=1.1, wpb=495857, bsz=16421.1, num_updates=19300, lr=0.000455251, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17845 epoch 012: 855 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545656, ups=1.1, wpb=495936, bsz=16741.2, num_updates=19400, lr=0.000454077, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17936 epoch 012: 855 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545656, ups=1.1, wpb=495936, bsz=16741.2, num_updates=19400, lr=0.000454077, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17936 epoch 012: 855 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545656, ups=1.1, wpb=495936, bsz=16741.2, num_updates=19400, lr=0.000454077, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17936 epoch 012: 855 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545656, ups=1.1, wpb=495936, bsz=16741.2, num_updates=19400, lr=0.000454077, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17936 epoch 012: 855 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545656, ups=1.1, wpb=495936, bsz=16741.2, num_updates=19400, lr=0.000454077, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17936 epoch 012: 855 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545656, ups=1.1, wpb=495936, bsz=16741.2, num_updates=19400, lr=0.000454077, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17936 epoch 012: 855 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545656, ups=1.1, wpb=495936, bsz=16741.2, num_updates=19400, lr=0.000454077, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17936 epoch 012: 855 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545656, ups=1.1, wpb=495936, bsz=16741.2, num_updates=19400, lr=0.000454077, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17936 epoch 012: 855 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545656, ups=1.1, wpb=495936, bsz=16741.2, num_updates=19400, lr=0.000454077, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17936 epoch 012: 855 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545656, ups=1.1, wpb=495936, bsz=16741.2, num_updates=19400, lr=0.000454077, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17936 epoch 012: 855 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545656, ups=1.1, wpb=495936, bsz=16741.2, num_updates=19400, lr=0.000454077, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17936 epoch 012: 855 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545656, ups=1.1, wpb=495936, bsz=16741.2, num_updates=19400, lr=0.000454077, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=17936 epoch 012: 955 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=549249, ups=1.11, wpb=495496, bsz=16702, num_updates=19500, lr=0.000452911, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=18026 epoch 012: 955 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=549249, ups=1.11, wpb=495496, bsz=16702, num_updates=19500, lr=0.000452911, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=18026 epoch 012: 955 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=549249, ups=1.11, wpb=495496, bsz=16702, num_updates=19500, lr=0.000452911, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=18026 epoch 012: 955 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=549249, ups=1.11, wpb=495496, bsz=16702, num_updates=19500, lr=0.000452911, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=18026 epoch 012: 955 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=549249, ups=1.11, wpb=495496, bsz=16702, num_updates=19500, lr=0.000452911, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=18026 epoch 012: 955 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=549249, ups=1.11, wpb=495496, bsz=16702, num_updates=19500, lr=0.000452911, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=18026 epoch 012: 955 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=549249, ups=1.11, wpb=495496, bsz=16702, num_updates=19500, lr=0.000452911, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=18026 epoch 012: 955 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=549249, ups=1.11, wpb=495496, bsz=16702, num_updates=19500, lr=0.000452911, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=18026 epoch 012: 955 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=549249, ups=1.11, wpb=495496, bsz=16702, num_updates=19500, lr=0.000452911, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=18026 epoch 012: 955 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=549249, ups=1.11, wpb=495496, bsz=16702, num_updates=19500, lr=0.000452911, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=18026 epoch 012: 955 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=549249, ups=1.11, wpb=495496, bsz=16702, num_updates=19500, lr=0.000452911, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=18026 epoch 012: 955 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=549249, ups=1.11, wpb=495496, bsz=16702, num_updates=19500, lr=0.000452911, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=18026 epoch 012: 1055 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=557484, ups=1.13, wpb=495377, bsz=16091, num_updates=19600, lr=0.000451754, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=19.9, wall=18115 epoch 012: 1055 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=557484, ups=1.13, wpb=495377, bsz=16091, num_updates=19600, lr=0.000451754, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=19.9, wall=18115 epoch 012: 1055 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=557484, ups=1.13, wpb=495377, bsz=16091, num_updates=19600, lr=0.000451754, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=19.9, wall=18115 epoch 012: 1055 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=557484, ups=1.13, wpb=495377, bsz=16091, num_updates=19600, lr=0.000451754, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=19.9, wall=18115 epoch 012: 1055 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=557484, ups=1.13, wpb=495377, bsz=16091, num_updates=19600, lr=0.000451754, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=19.9, wall=18115 epoch 012: 1055 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=557484, ups=1.13, wpb=495377, bsz=16091, num_updates=19600, lr=0.000451754, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=19.9, wall=18115 epoch 012: 1055 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=557484, ups=1.13, wpb=495377, bsz=16091, num_updates=19600, lr=0.000451754, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=19.9, wall=18115 epoch 012: 1055 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=557484, ups=1.13, wpb=495377, bsz=16091, num_updates=19600, lr=0.000451754, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=19.9, wall=18115 epoch 012: 1055 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=557484, ups=1.13, wpb=495377, bsz=16091, num_updates=19600, lr=0.000451754, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=19.9, wall=18115 epoch 012: 1055 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=557484, ups=1.13, wpb=495377, bsz=16091, num_updates=19600, lr=0.000451754, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=19.9, wall=18115 epoch 012: 1055 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=557484, ups=1.13, wpb=495377, bsz=16091, num_updates=19600, lr=0.000451754, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=19.9, wall=18115 epoch 012: 1055 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=557484, ups=1.13, wpb=495377, bsz=16091, num_updates=19600, lr=0.000451754, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=19.9, wall=18115 epoch 012: 1155 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=551884, ups=1.11, wpb=495859, bsz=16378.6, num_updates=19700, lr=0.000450606, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=18205 epoch 012: 1155 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=551884, ups=1.11, wpb=495859, bsz=16378.6, num_updates=19700, lr=0.000450606, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=18205 epoch 012: 1155 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=551884, ups=1.11, wpb=495859, bsz=16378.6, num_updates=19700, lr=0.000450606, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=18205 epoch 012: 1155 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=551884, ups=1.11, wpb=495859, bsz=16378.6, num_updates=19700, lr=0.000450606, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=18205 epoch 012: 1155 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=551884, ups=1.11, wpb=495859, bsz=16378.6, num_updates=19700, lr=0.000450606, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=18205 epoch 012: 1155 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=551884, ups=1.11, wpb=495859, bsz=16378.6, num_updates=19700, lr=0.000450606, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=18205 epoch 012: 1155 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=551884, ups=1.11, wpb=495859, bsz=16378.6, num_updates=19700, lr=0.000450606, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=18205 epoch 012: 1155 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=551884, ups=1.11, wpb=495859, bsz=16378.6, num_updates=19700, lr=0.000450606, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=18205 epoch 012: 1155 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=551884, ups=1.11, wpb=495859, bsz=16378.6, num_updates=19700, lr=0.000450606, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=18205 epoch 012: 1155 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=551884, ups=1.11, wpb=495859, bsz=16378.6, num_updates=19700, lr=0.000450606, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=18205 epoch 012: 1155 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=551884, ups=1.11, wpb=495859, bsz=16378.6, num_updates=19700, lr=0.000450606, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=18205 epoch 012: 1155 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=551884, ups=1.11, wpb=495859, bsz=16378.6, num_updates=19700, lr=0.000450606, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=18205 epoch 012: 1256 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=545593, ups=1.1, wpb=494436, bsz=16755.9, num_updates=19800, lr=0.000449467, gnorm=0.191, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=18296 epoch 012: 1256 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=545593, ups=1.1, wpb=494436, bsz=16755.9, num_updates=19800, lr=0.000449467, gnorm=0.191, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=18296 epoch 012: 1256 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=545593, ups=1.1, wpb=494436, bsz=16755.9, num_updates=19800, lr=0.000449467, gnorm=0.191, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=18296 epoch 012: 1256 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=545593, ups=1.1, wpb=494436, bsz=16755.9, num_updates=19800, lr=0.000449467, gnorm=0.191, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=18296 epoch 012: 1256 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=545593, ups=1.1, wpb=494436, bsz=16755.9, num_updates=19800, lr=0.000449467, gnorm=0.191, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=18296 epoch 012: 1256 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=545593, ups=1.1, wpb=494436, bsz=16755.9, num_updates=19800, lr=0.000449467, gnorm=0.191, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=18296 epoch 012: 1256 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=545593, ups=1.1, wpb=494436, bsz=16755.9, num_updates=19800, lr=0.000449467, gnorm=0.191, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=18296 epoch 012: 1256 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=545593, ups=1.1, wpb=494436, bsz=16755.9, num_updates=19800, lr=0.000449467, gnorm=0.191, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=18296 epoch 012: 1256 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=545593, ups=1.1, wpb=494436, bsz=16755.9, num_updates=19800, lr=0.000449467, gnorm=0.191, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=18296 epoch 012: 1256 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=545593, ups=1.1, wpb=494436, bsz=16755.9, num_updates=19800, lr=0.000449467, gnorm=0.191, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=18296 epoch 012: 1256 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=545593, ups=1.1, wpb=494436, bsz=16755.9, num_updates=19800, lr=0.000449467, gnorm=0.191, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=18296 epoch 012: 1256 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=545593, ups=1.1, wpb=494436, bsz=16755.9, num_updates=19800, lr=0.000449467, gnorm=0.191, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=18296 epoch 012: 1356 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=550019, ups=1.11, wpb=495352, bsz=16858.1, num_updates=19900, lr=0.000448336, gnorm=0.192, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=18386 epoch 012: 1356 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=550019, ups=1.11, wpb=495352, bsz=16858.1, num_updates=19900, lr=0.000448336, gnorm=0.192, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=18386 epoch 012: 1356 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=550019, ups=1.11, wpb=495352, bsz=16858.1, num_updates=19900, lr=0.000448336, gnorm=0.192, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=18386 epoch 012: 1356 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=550019, ups=1.11, wpb=495352, bsz=16858.1, num_updates=19900, lr=0.000448336, gnorm=0.192, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=18386 epoch 012: 1356 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=550019, ups=1.11, wpb=495352, bsz=16858.1, num_updates=19900, lr=0.000448336, gnorm=0.192, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=18386 epoch 012: 1356 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=550019, ups=1.11, wpb=495352, bsz=16858.1, num_updates=19900, lr=0.000448336, gnorm=0.192, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=18386 epoch 012: 1356 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=550019, ups=1.11, wpb=495352, bsz=16858.1, num_updates=19900, lr=0.000448336, gnorm=0.192, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=18386 epoch 012: 1356 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=550019, ups=1.11, wpb=495352, bsz=16858.1, num_updates=19900, lr=0.000448336, gnorm=0.192, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=18386 epoch 012: 1356 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=550019, ups=1.11, wpb=495352, bsz=16858.1, num_updates=19900, lr=0.000448336, gnorm=0.192, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=18386 epoch 012: 1356 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=550019, ups=1.11, wpb=495352, bsz=16858.1, num_updates=19900, lr=0.000448336, gnorm=0.192, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=18386 epoch 012: 1356 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=550019, ups=1.11, wpb=495352, bsz=16858.1, num_updates=19900, lr=0.000448336, gnorm=0.192, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=18386 epoch 012: 1356 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=550019, ups=1.11, wpb=495352, bsz=16858.1, num_updates=19900, lr=0.000448336, gnorm=0.192, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=18386 epoch 012: 1456 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=554467, ups=1.12, wpb=494691, bsz=16361.2, num_updates=20000, lr=0.000447214, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18475 epoch 012: 1456 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=554467, ups=1.12, wpb=494691, bsz=16361.2, num_updates=20000, lr=0.000447214, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18475 epoch 012: 1456 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=554467, ups=1.12, wpb=494691, bsz=16361.2, num_updates=20000, lr=0.000447214, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18475 epoch 012: 1456 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=554467, ups=1.12, wpb=494691, bsz=16361.2, num_updates=20000, lr=0.000447214, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18475 epoch 012: 1456 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=554467, ups=1.12, wpb=494691, bsz=16361.2, num_updates=20000, lr=0.000447214, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18475 epoch 012: 1456 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=554467, ups=1.12, wpb=494691, bsz=16361.2, num_updates=20000, lr=0.000447214, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18475 epoch 012: 1456 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=554467, ups=1.12, wpb=494691, bsz=16361.2, num_updates=20000, lr=0.000447214, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18475 epoch 012: 1456 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=554467, ups=1.12, wpb=494691, bsz=16361.2, num_updates=20000, lr=0.000447214, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18475 epoch 012: 1456 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=554467, ups=1.12, wpb=494691, bsz=16361.2, num_updates=20000, lr=0.000447214, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18475 epoch 012: 1456 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=554467, ups=1.12, wpb=494691, bsz=16361.2, num_updates=20000, lr=0.000447214, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18475 epoch 012: 1456 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=554467, ups=1.12, wpb=494691, bsz=16361.2, num_updates=20000, lr=0.000447214, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18475 epoch 012: 1456 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=554467, ups=1.12, wpb=494691, bsz=16361.2, num_updates=20000, lr=0.000447214, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18475 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 3.743 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.743 epoch 012 | valid on 'valid' subset | loss 3.743 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.743 epoch 012 | valid on 'valid' subset | loss 3.743 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.743 epoch 012 | valid on 'valid' subset | loss 3.743 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.743 epoch 012 | valid on 'valid' subset | loss 3.743 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.743 epoch 012 | valid on 'valid' subset | loss 3.743 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.743 epoch 012 | valid on 'valid' subset | loss 3.743 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.743 epoch 012 | valid on 'valid' subset | loss 3.743 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.743 epoch 012 | valid on 'valid' subset | loss 3.743 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.743 epoch 012 | valid on 'valid' subset | loss 3.743 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.743 epoch 012 | valid on 'valid' subset | loss 3.743 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.743 epoch 012 | valid on 'valid' subset | loss 3.743 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.743 epoch 012: 1556 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=451611, ups=0.91, wpb=495220, bsz=16587.5, num_updates=20100, lr=0.0004461, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=18585 epoch 012: 1556 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=451611, ups=0.91, wpb=495220, bsz=16587.5, num_updates=20100, lr=0.0004461, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=18585 epoch 012: 1556 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=451611, ups=0.91, wpb=495220, bsz=16587.5, num_updates=20100, lr=0.0004461, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=18585 epoch 012: 1556 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=451611, ups=0.91, wpb=495220, bsz=16587.5, num_updates=20100, lr=0.0004461, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=18585 epoch 012: 1556 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=451611, ups=0.91, wpb=495220, bsz=16587.5, num_updates=20100, lr=0.0004461, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=18585 epoch 012: 1556 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=451611, ups=0.91, wpb=495220, bsz=16587.5, num_updates=20100, lr=0.0004461, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=18585 epoch 012: 1556 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=451611, ups=0.91, wpb=495220, bsz=16587.5, num_updates=20100, lr=0.0004461, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=18585 epoch 012: 1556 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=451611, ups=0.91, wpb=495220, bsz=16587.5, num_updates=20100, lr=0.0004461, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=18585 epoch 012: 1556 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=451611, ups=0.91, wpb=495220, bsz=16587.5, num_updates=20100, lr=0.0004461, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=18585 epoch 012: 1556 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=451611, ups=0.91, wpb=495220, bsz=16587.5, num_updates=20100, lr=0.0004461, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=18585 epoch 012: 1556 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=451611, ups=0.91, wpb=495220, bsz=16587.5, num_updates=20100, lr=0.0004461, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=18585 epoch 012: 1556 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=451611, ups=0.91, wpb=495220, bsz=16587.5, num_updates=20100, lr=0.0004461, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=18585 epoch 012: 1656 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=554805, ups=1.12, wpb=496273, bsz=16519, num_updates=20200, lr=0.000444994, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=18674 epoch 012: 1656 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=554805, ups=1.12, wpb=496273, bsz=16519, num_updates=20200, lr=0.000444994, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=18674 epoch 012: 1656 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=554805, ups=1.12, wpb=496273, bsz=16519, num_updates=20200, lr=0.000444994, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=18674 epoch 012: 1656 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=554805, ups=1.12, wpb=496273, bsz=16519, num_updates=20200, lr=0.000444994, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=18674 epoch 012: 1656 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=554805, ups=1.12, wpb=496273, bsz=16519, num_updates=20200, lr=0.000444994, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=18674 epoch 012: 1656 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=554805, ups=1.12, wpb=496273, bsz=16519, num_updates=20200, lr=0.000444994, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=18674 epoch 012: 1656 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=554805, ups=1.12, wpb=496273, bsz=16519, num_updates=20200, lr=0.000444994, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=18674 epoch 012: 1656 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=554805, ups=1.12, wpb=496273, bsz=16519, num_updates=20200, lr=0.000444994, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=18674 epoch 012: 1656 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=554805, ups=1.12, wpb=496273, bsz=16519, num_updates=20200, lr=0.000444994, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=18674 epoch 012: 1656 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=554805, ups=1.12, wpb=496273, bsz=16519, num_updates=20200, lr=0.000444994, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=18674 epoch 012: 1656 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=554805, ups=1.12, wpb=496273, bsz=16519, num_updates=20200, lr=0.000444994, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=18674 epoch 012: 1656 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=554805, ups=1.12, wpb=496273, bsz=16519, num_updates=20200, lr=0.000444994, gnorm=0.197, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=18674 end of epoch 12 (average epoch stats below) epoch 012 | loss 3.72 | nll_loss 2.188 | ppl 4.56 | wps 536580 | ups 1.08 | wpb 495124 | bsz 16507 | num_updates 20233 | lr 0.000444631 | gnorm 0.193 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23 | wall 18703 epoch 012 | loss 3.72 | nll_loss 2.188 | ppl 4.56 | wps 536580 | ups 1.08 | wpb 495124 | bsz 16507 | num_updates 20233 | lr 0.000444631 | gnorm 0.193 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23 | wall 18703 epoch 012 | loss 3.72 | nll_loss 2.188 | ppl 4.56 | wps 536580 | ups 1.08 | wpb 495124 | bsz 16507 | num_updates 20233 | lr 0.000444631 | gnorm 0.193 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23 | wall 18703 epoch 012 | loss 3.72 | nll_loss 2.188 | ppl 4.56 | wps 536580 | ups 1.08 | wpb 495124 | bsz 16507 | num_updates 20233 | lr 0.000444631 | gnorm 0.193 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23 | wall 18703 epoch 012 | loss 3.72 | nll_loss 2.188 | ppl 4.56 | wps 536580 | ups 1.08 | wpb 495124 | bsz 16507 | num_updates 20233 | lr 0.000444631 | gnorm 0.193 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23 | wall 18703 epoch 012 | loss 3.72 | nll_loss 2.188 | ppl 4.56 | wps 536580 | ups 1.08 | wpb 495124 | bsz 16507 | num_updates 20233 | lr 0.000444631 | gnorm 0.193 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23 | wall 18703 epoch 012 | loss 3.72 | nll_loss 2.188 | ppl 4.56 | wps 536580 | ups 1.08 | wpb 495124 | bsz 16507 | num_updates 20233 | lr 0.000444631 | gnorm 0.193 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23 | wall 18703 epoch 012 | loss 3.72 | nll_loss 2.188 | ppl 4.56 | wps 536580 | ups 1.08 | wpb 495124 | bsz 16507 | num_updates 20233 | lr 0.000444631 | gnorm 0.193 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23 | wall 18703 epoch 012 | loss 3.72 | nll_loss 2.188 | ppl 4.56 | wps 536580 | ups 1.08 | wpb 495124 | bsz 16507 | num_updates 20233 | lr 0.000444631 | gnorm 0.193 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23 | wall 18703 epoch 012 | loss 3.72 | nll_loss 2.188 | ppl 4.56 | wps 536580 | ups 1.08 | wpb 495124 | bsz 16507 | num_updates 20233 | lr 0.000444631 | gnorm 0.193 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23 | wall 18703 epoch 012 | loss 3.72 | nll_loss 2.188 | ppl 4.56 | wps 536580 | ups 1.08 | wpb 495124 | bsz 16507 | num_updates 20233 | lr 0.000444631 | gnorm 0.193 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23 | wall 18703 epoch 012 | loss 3.72 | nll_loss 2.188 | ppl 4.56 | wps 536580 | ups 1.08 | wpb 495124 | bsz 16507 | num_updates 20233 | lr 0.000444631 | gnorm 0.193 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23 | wall 18703 Start iterating over samples epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 67 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548908, ups=1.12, wpb=491482, bsz=16053.9, num_updates=20300, lr=0.000443897, gnorm=0.206, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=18764 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 168 / 1689 loss=3.697, nll_loss=2.162, ppl=4.47, wps=548201, ups=1.11, wpb=495151, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=18854 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 268 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=552423, ups=1.12, wpb=495199, bsz=16504.6, num_updates=20500, lr=0.000441726, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=18944 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 368 / 1689 loss=3.704, nll_loss=2.169, ppl=4.5, wps=548780, ups=1.11, wpb=494378, bsz=16818.6, num_updates=20600, lr=0.000440653, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=19034 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 468 / 1689 loss=3.711, nll_loss=2.178, ppl=4.52, wps=549550, ups=1.11, wpb=494100, bsz=16503.8, num_updates=20700, lr=0.000439587, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=19124 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 568 / 1689 loss=3.709, nll_loss=2.175, ppl=4.51, wps=550173, ups=1.11, wpb=495422, bsz=16056.1, num_updates=20800, lr=0.000438529, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=19214 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 668 / 1689 loss=3.709, nll_loss=2.175, ppl=4.52, wps=553920, ups=1.12, wpb=496334, bsz=17050.3, num_updates=20900, lr=0.000437479, gnorm=0.191, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19303 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 epoch 013: 768 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=551311, ups=1.11, wpb=494896, bsz=16508, num_updates=21000, lr=0.000436436, gnorm=0.195, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=19393 begin validation on "valid" subset epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013 | valid on 'valid' subset | loss 3.736 | nll_loss 2.168 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.736 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 868 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=389513, ups=0.79, wpb=495758, bsz=16727.5, num_updates=21100, lr=0.0004354, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=21.2, wall=19520 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 969 / 1689 loss=3.703, nll_loss=2.168, ppl=4.5, wps=557579, ups=1.12, wpb=496200, bsz=16496.4, num_updates=21200, lr=0.000434372, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=19609 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1069 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=556158, ups=1.12, wpb=495329, bsz=16472.8, num_updates=21300, lr=0.000433351, gnorm=0.199, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=19698 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1169 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=553534, ups=1.12, wpb=494222, bsz=16406.8, num_updates=21400, lr=0.000432338, gnorm=0.194, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=19788 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1269 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=557453, ups=1.12, wpb=495704, bsz=16266.4, num_updates=21500, lr=0.000431331, gnorm=0.194, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=19877 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1369 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=551068, ups=1.11, wpb=494931, bsz=16774.7, num_updates=21600, lr=0.000430331, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.8, wall=19966 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1469 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=552456, ups=1.11, wpb=495738, bsz=16774.9, num_updates=21700, lr=0.000429339, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=20056 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1569 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=552680, ups=1.12, wpb=494914, bsz=16417, num_updates=21800, lr=0.000428353, gnorm=0.197, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=20146 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 epoch 013: 1669 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=545946, ups=1.1, wpb=495705, bsz=16408.2, num_updates=21900, lr=0.000427374, gnorm=0.192, clip=0, loss_scale=4, train_wall=90, gb_free=21.6, wall=20236 end of epoch 13 (average epoch stats below) epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 epoch 013 | loss 3.707 | nll_loss 2.173 | ppl 4.51 | wps 538584 | ups 1.09 | wpb 495108 | bsz 16505.8 | num_updates 21920 | lr 0.000427179 | gnorm 0.191 | clip 0 | loss_scale 4 | train_wall 1496 | gb_free 22.9 | wall 20254 Start iterating over samples epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 epoch 014: 80 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=536572, ups=1.09, wpb=492512, bsz=16246.1, num_updates=22000, lr=0.000426401, gnorm=0.187, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=20328 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.725 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.725 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 180 / 1689 loss=3.682, nll_loss=2.145, ppl=4.42, wps=457302, ups=0.92, wpb=497353, bsz=16513.6, num_updates=22100, lr=0.000425436, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20437 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 281 / 1689 loss=3.689, nll_loss=2.152, ppl=4.45, wps=548573, ups=1.1, wpb=496920, bsz=16350.2, num_updates=22200, lr=0.000424476, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=20528 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 381 / 1689 loss=3.691, nll_loss=2.155, ppl=4.45, wps=549808, ups=1.11, wpb=495802, bsz=16751, num_updates=22300, lr=0.000423524, gnorm=0.186, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=20618 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 481 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=552502, ups=1.12, wpb=495248, bsz=16509.2, num_updates=22400, lr=0.000422577, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=20707 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 582 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=550150, ups=1.11, wpb=495494, bsz=16481, num_updates=22500, lr=0.000421637, gnorm=0.191, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20797 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 682 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=553658, ups=1.12, wpb=494117, bsz=16364.8, num_updates=22600, lr=0.000420703, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=20887 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 782 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=552766, ups=1.12, wpb=494935, bsz=16429.7, num_updates=22700, lr=0.000419775, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=20976 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 882 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=547776, ups=1.11, wpb=495546, bsz=16623.3, num_updates=22800, lr=0.000418854, gnorm=0.19, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=21067 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 982 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=544931, ups=1.1, wpb=494455, bsz=16710, num_updates=22900, lr=0.000417938, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=21157 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 epoch 014: 1082 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=544980, ups=1.1, wpb=494752, bsz=16620.9, num_updates=23000, lr=0.000417029, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=21248 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014 | valid on 'valid' subset | loss 3.729 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.725 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1182 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=488326, ups=0.99, wpb=493871, bsz=16492.5, num_updates=23100, lr=0.000416125, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=21349 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1282 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=554986, ups=1.12, wpb=496228, bsz=16191.7, num_updates=23200, lr=0.000415227, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=21439 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1383 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=545703, ups=1.1, wpb=495234, bsz=16327.4, num_updates=23300, lr=0.000414335, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=21530 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1483 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549707, ups=1.11, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=21620 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1583 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=550088, ups=1.11, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=21710 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 epoch 014: 1683 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=545748, ups=1.11, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21800 end of epoch 14 (average epoch stats below) epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 epoch 014 | loss 3.696 | nll_loss 2.161 | ppl 4.47 | wps 538189 | ups 1.09 | wpb 495141 | bsz 16504 | num_updates 23606 | lr 0.000411641 | gnorm 0.187 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 23.7 | wall 21805 Start iterating over samples epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 94 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=536862, ups=1.1, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.195, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=21892 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 194 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=550739, ups=1.11, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=21982 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 294 / 1689 loss=3.677, nll_loss=2.139, ppl=4.41, wps=547379, ups=1.11, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=22072 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 epoch 015: 395 / 1689 loss=3.68, nll_loss=2.143, ppl=4.42, wps=548197, ups=1.1, wpb=496538, bsz=16810.8, num_updates=24000, lr=0.000408248, gnorm=0.179, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=22163 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.716 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 495 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=457081, ups=0.92, wpb=495745, bsz=16783.8, num_updates=24100, lr=0.0004074, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=22271 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 595 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=551837, ups=1.12, wpb=494236, bsz=16320.8, num_updates=24200, lr=0.000406558, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=22361 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 695 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=546767, ups=1.1, wpb=495050, bsz=16679.2, num_updates=24300, lr=0.00040572, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=22451 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 795 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=549371, ups=1.11, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=22541 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 895 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=547711, ups=1.11, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=22632 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 996 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=548002, ups=1.1, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21, wall=22722 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1096 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551746, ups=1.11, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=22812 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1196 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552883, ups=1.12, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=22901 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1296 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=549349, ups=1.11, wpb=495260, bsz=16984.5, num_updates=24900, lr=0.000400802, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=22992 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 epoch 015: 1396 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=545876, ups=1.1, wpb=497147, bsz=16467.6, num_updates=25000, lr=0.0004, gnorm=0.184, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23083 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015 | valid on 'valid' subset | loss 3.716 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.716 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1496 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=300863, ups=0.61, wpb=494825, bsz=16903.4, num_updates=25100, lr=0.000399202, gnorm=0.183, clip=0, loss_scale=4, train_wall=93, gb_free=21.8, wall=23247 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 epoch 015: 1596 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556708, ups=1.12, wpb=496099, bsz=16564.4, num_updates=25200, lr=0.00039841, gnorm=0.191, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=23336 end of epoch 15 (average epoch stats below) epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 epoch 015 | loss 3.685 | nll_loss 2.149 | ppl 4.44 | wps 517723 | ups 1.05 | wpb 495120 | bsz 16504.8 | num_updates 25293 | lr 0.000397676 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 24.1 | wall 23418 Start iterating over samples epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 7 / 1689 loss=3.694, nll_loss=2.16, ppl=4.47, wps=551396, ups=1.12, wpb=491749, bsz=15969.3, num_updates=25300, lr=0.000397621, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=23425 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 107 / 1689 loss=3.669, nll_loss=2.13, ppl=4.38, wps=552358, ups=1.12, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=23515 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 207 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=554983, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=23605 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 308 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=550266, ups=1.11, wpb=493616, bsz=16210.8, num_updates=25600, lr=0.000395285, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=23694 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 409 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548031, ups=1.11, wpb=494907, bsz=16129.4, num_updates=25700, lr=0.000394515, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23785 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 509 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552676, ups=1.11, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.5, wall=23874 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 609 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=546129, ups=1.1, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23965 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 epoch 016: 709 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=551129, ups=1.12, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24055 begin validation on "valid" subset epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016 | valid on 'valid' subset | loss 3.704 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.704 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 809 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=452214, ups=0.91, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=24164 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 909 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=552672, ups=1.12, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=24254 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1009 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=550672, ups=1.11, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=24344 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1110 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=548995, ups=1.11, wpb=496344, bsz=16552, num_updates=26400, lr=0.000389249, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=24434 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1210 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551388, ups=1.11, wpb=495013, bsz=17078.6, num_updates=26500, lr=0.000388514, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=24524 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1310 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=549714, ups=1.11, wpb=496552, bsz=16732, num_updates=26600, lr=0.000387783, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=24614 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1410 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=552667, ups=1.12, wpb=495322, bsz=16386.6, num_updates=26700, lr=0.000387056, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=24704 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1510 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=546187, ups=1.1, wpb=495174, bsz=16525.8, num_updates=26800, lr=0.000386334, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=24794 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 epoch 016: 1610 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=552570, ups=1.11, wpb=496494, bsz=16259.4, num_updates=26900, lr=0.000385615, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=24884 end of epoch 16 (average epoch stats below) epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 epoch 016 | loss 3.676 | nll_loss 2.139 | ppl 4.4 | wps 543343 | ups 1.1 | wpb 495121 | bsz 16502 | num_updates 26979 | lr 0.00038505 | gnorm 0.178 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.1 | wall 24955 Start iterating over samples epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 epoch 017: 21 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=546350, ups=1.11, wpb=490751, bsz=16628.2, num_updates=27000, lr=0.0003849, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=24974 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.713 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.704 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 122 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=479593, ups=0.97, wpb=495970, bsz=16456.6, num_updates=27100, lr=0.000384189, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25078 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 223 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=550107, ups=1.11, wpb=495260, bsz=16379.6, num_updates=27200, lr=0.000383482, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=25168 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 323 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=550197, ups=1.11, wpb=494593, bsz=16531.8, num_updates=27300, lr=0.00038278, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=25258 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 423 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=551365, ups=1.11, wpb=496006, bsz=16594, num_updates=27400, lr=0.00038208, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=25347 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 523 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=549926, ups=1.11, wpb=494119, bsz=16364.7, num_updates=27500, lr=0.000381385, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=25437 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 623 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=545280, ups=1.1, wpb=493907, bsz=16254.6, num_updates=27600, lr=0.000380693, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=25528 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 723 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=548695, ups=1.11, wpb=495185, bsz=16334.2, num_updates=27700, lr=0.000380006, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=25618 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 823 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=550819, ups=1.11, wpb=496968, bsz=16993, num_updates=27800, lr=0.000379322, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25708 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 923 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=543910, ups=1.1, wpb=495331, bsz=16662.4, num_updates=27900, lr=0.000378641, gnorm=0.187, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=25799 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 epoch 017: 1023 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=546413, ups=1.1, wpb=494876, bsz=16405.6, num_updates=28000, lr=0.000377964, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=25890 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017 | valid on 'valid' subset | loss 3.718 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.704 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1123 / 1689 loss=3.667, nll_loss=2.129, ppl=4.37, wps=488839, ups=0.98, wpb=496574, bsz=16654.7, num_updates=28100, lr=0.000377291, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=25992 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1223 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=552342, ups=1.12, wpb=495185, bsz=16350.2, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=26081 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1323 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=549568, ups=1.11, wpb=494783, bsz=16410.8, num_updates=28300, lr=0.000375956, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=26171 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1424 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=541499, ups=1.09, wpb=495450, bsz=16524.6, num_updates=28400, lr=0.000375293, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=22.7, wall=26263 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1524 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=557355, ups=1.12, wpb=496779, bsz=16677.7, num_updates=28500, lr=0.000374634, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26352 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 epoch 017: 1624 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551109, ups=1.11, wpb=495690, bsz=16834.5, num_updates=28600, lr=0.000373979, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=26442 end of epoch 17 (average epoch stats below) epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 epoch 017 | loss 3.667 | nll_loss 2.13 | ppl 4.38 | wps 540542 | ups 1.09 | wpb 495134 | bsz 16504.6 | num_updates 28665 | lr 0.000373555 | gnorm 0.177 | clip 0 | loss_scale 2 | train_wall 1493 | gb_free 20.7 | wall 26499 Start iterating over samples epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 35 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=548121, ups=1.12, wpb=490141, bsz=16054.9, num_updates=28700, lr=0.000373327, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=26531 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 135 / 1689 loss=3.649, nll_loss=2.108, ppl=4.31, wps=549011, ups=1.11, wpb=494867, bsz=16447.4, num_updates=28800, lr=0.000372678, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=26621 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 235 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=551671, ups=1.11, wpb=496253, bsz=16335.2, num_updates=28900, lr=0.000372033, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26711 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 epoch 018: 336 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=547404, ups=1.1, wpb=495605, bsz=16539.8, num_updates=29000, lr=0.000371391, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=26802 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.704 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 436 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=492077, ups=0.99, wpb=498618, bsz=16640.6, num_updates=29100, lr=0.000370752, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=26903 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 536 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=551245, ups=1.11, wpb=495597, bsz=16463.7, num_updates=29200, lr=0.000370117, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=26993 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 636 / 1689 loss=3.657, nll_loss=2.118, ppl=4.34, wps=554680, ups=1.12, wpb=495918, bsz=16344.8, num_updates=29300, lr=0.000369484, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=27083 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 736 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547889, ups=1.11, wpb=495707, bsz=16572.6, num_updates=29400, lr=0.000368856, gnorm=0.172, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=27173 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 836 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=549245, ups=1.11, wpb=494879, bsz=16896.7, num_updates=29500, lr=0.00036823, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=27263 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 936 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=559012, ups=1.13, wpb=496095, bsz=16400.4, num_updates=29600, lr=0.000367607, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=27352 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1036 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=552842, ups=1.12, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27441 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1137 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=547058, ups=1.1, wpb=495610, bsz=16584.4, num_updates=29800, lr=0.000366372, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=27532 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1237 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555169, ups=1.12, wpb=495201, bsz=16667.8, num_updates=29900, lr=0.000365758, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=27621 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 epoch 018: 1337 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=557038, ups=1.12, wpb=495436, bsz=16288.7, num_updates=30000, lr=0.000365148, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=27710 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018 | valid on 'valid' subset | loss 3.711 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.704 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1437 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=312256, ups=0.63, wpb=495158, bsz=16344.7, num_updates=30100, lr=0.000364541, gnorm=0.177, clip=0, loss_scale=2, train_wall=135, gb_free=21.6, wall=27869 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1537 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552168, ups=1.12, wpb=495067, bsz=16318.7, num_updates=30200, lr=0.000363937, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=27958 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 epoch 018: 1637 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=546414, ups=1.11, wpb=492228, bsz=16885, num_updates=30300, lr=0.000363336, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=28048 end of epoch 18 (average epoch stats below) epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 epoch 018 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 523672 | ups 1.06 | wpb 495116 | bsz 16504.9 | num_updates 30352 | lr 0.000363025 | gnorm 0.174 | clip 0 | loss_scale 4 | train_wall 1541 | gb_free 22.6 | wall 28094 Start iterating over samples epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 48 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=543181, ups=1.1, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=28139 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 148 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=547082, ups=1.1, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=28230 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 249 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=545117, ups=1.1, wpb=496768, bsz=16574.2, num_updates=30600, lr=0.000361551, gnorm=0.178, clip=0, loss_scale=2, train_wall=90, gb_free=21.9, wall=28321 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 349 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=548975, ups=1.11, wpb=495917, bsz=16604, num_updates=30700, lr=0.000360961, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28411 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 449 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=548022, ups=1.1, wpb=495954, bsz=16096.3, num_updates=30800, lr=0.000360375, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=28502 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 549 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550997, ups=1.11, wpb=496487, bsz=16604, num_updates=30900, lr=0.000359791, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=28592 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 epoch 019: 649 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=546604, ups=1.1, wpb=495764, bsz=16927.4, num_updates=31000, lr=0.000359211, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=28682 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.703 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.703 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 749 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=403219, ups=0.81, wpb=494965, bsz=16635.5, num_updates=31100, lr=0.000358633, gnorm=0.166, clip=0, loss_scale=4, train_wall=91, gb_free=22.2, wall=28805 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 850 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=552151, ups=1.12, wpb=494794, bsz=16414.5, num_updates=31200, lr=0.000358057, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=28895 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 950 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=556623, ups=1.12, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=28984 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1050 / 1689 loss=3.655, nll_loss=2.116, ppl=4.33, wps=550531, ups=1.11, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29074 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1150 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549444, ups=1.11, wpb=493769, bsz=16390.9, num_updates=31500, lr=0.000356348, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29164 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1250 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=550793, ups=1.11, wpb=494412, bsz=16799, num_updates=31600, lr=0.000355784, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=29253 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1351 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=544068, ups=1.1, wpb=495127, bsz=16626.9, num_updates=31700, lr=0.000355222, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=29344 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1451 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=552736, ups=1.12, wpb=495268, bsz=16229.4, num_updates=31800, lr=0.000354663, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=29434 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1551 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=550760, ups=1.11, wpb=495481, bsz=16221.8, num_updates=31900, lr=0.000354107, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=29524 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 epoch 019: 1651 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=550676, ups=1.11, wpb=495015, bsz=16633.4, num_updates=32000, lr=0.000353553, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=29614 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 epoch 019 | valid on 'valid' subset | loss 3.704 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.703 end of epoch 19 (average epoch stats below) epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 epoch 019 | loss 3.652 | nll_loss 2.113 | ppl 4.33 | wps 533081 | ups 1.08 | wpb 495124 | bsz 16501.5 | num_updates 32038 | lr 0.000353344 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 23.2 | wall 29660 Start iterating over samples epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 62 / 1689 loss=3.636, nll_loss=2.094, ppl=4.27, wps=480660, ups=0.98, wpb=491002, bsz=16311.4, num_updates=32100, lr=0.000353002, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=29716 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 162 / 1689 loss=3.637, nll_loss=2.095, ppl=4.27, wps=552698, ups=1.11, wpb=495735, bsz=16171.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29806 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 263 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=546149, ups=1.1, wpb=495099, bsz=16406.7, num_updates=32300, lr=0.000351908, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29896 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 363 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=545878, ups=1.1, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=22.9, wall=29987 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 463 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=555234, ups=1.12, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=30077 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 563 / 1689 loss=3.646, nll_loss=2.106, ppl=4.3, wps=545853, ups=1.1, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=30167 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 663 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=552527, ups=1.12, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=30257 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 764 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=544989, ups=1.1, wpb=496631, bsz=16942.7, num_updates=32800, lr=0.000349215, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=16.7, wall=30348 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 864 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=543826, ups=1.1, wpb=493941, bsz=16369, num_updates=32900, lr=0.000348684, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=30439 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 epoch 020: 964 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546445, ups=1.11, wpb=494471, bsz=16450.2, num_updates=33000, lr=0.000348155, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30529 begin validation on "valid" subset epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.703 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1064 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=409604, ups=0.83, wpb=495238, bsz=16383, num_updates=33100, lr=0.000347629, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30650 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1164 / 1689 loss=3.642, nll_loss=2.102, ppl=4.29, wps=554463, ups=1.12, wpb=494477, bsz=16596.2, num_updates=33200, lr=0.000347105, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=30739 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1264 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=554232, ups=1.12, wpb=495162, bsz=16686.8, num_updates=33300, lr=0.000346583, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=30829 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1365 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=545638, ups=1.1, wpb=494844, bsz=16334.9, num_updates=33400, lr=0.000346064, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=30920 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1466 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=546076, ups=1.1, wpb=494764, bsz=16560.2, num_updates=33500, lr=0.000345547, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=31010 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1566 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=551995, ups=1.11, wpb=495172, bsz=16230.2, num_updates=33600, lr=0.000345033, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31100 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 epoch 020: 1666 / 1689 loss=3.65, nll_loss=2.111, ppl=4.32, wps=553795, ups=1.12, wpb=496548, bsz=16557.6, num_updates=33700, lr=0.00034452, gnorm=0.166, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=31189 end of epoch 20 (average epoch stats below) epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 epoch 020 | loss 3.646 | nll_loss 2.106 | ppl 4.3 | wps 538464 | ups 1.09 | wpb 495116 | bsz 16504.5 | num_updates 33723 | lr 0.000344403 | gnorm 0.171 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.5 | wall 31209 Start iterating over samples epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 77 / 1689 loss=3.631, nll_loss=2.089, ppl=4.25, wps=552342, ups=1.12, wpb=491672, bsz=16281.6, num_updates=33800, lr=0.00034401, gnorm=0.175, clip=0, loss_scale=1, train_wall=86, gb_free=21, wall=31279 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 177 / 1689 loss=3.632, nll_loss=2.09, ppl=4.26, wps=554481, ups=1.12, wpb=495415, bsz=16421.1, num_updates=33900, lr=0.000343503, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31368 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 epoch 021: 277 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=552121, ups=1.11, wpb=495720, bsz=16546.2, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=31458 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.702 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.702 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 377 / 1689 loss=3.637, nll_loss=2.096, ppl=4.27, wps=454967, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=31566 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 477 / 1689 loss=3.638, nll_loss=2.097, ppl=4.28, wps=551467, ups=1.11, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=31656 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 577 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=553209, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=31745 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 678 / 1689 loss=3.642, nll_loss=2.101, ppl=4.29, wps=543765, ups=1.1, wpb=495571, bsz=16509.5, num_updates=34400, lr=0.000340997, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=31837 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 778 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=548406, ups=1.11, wpb=494386, bsz=16662.9, num_updates=34500, lr=0.000340503, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=31927 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 878 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=547797, ups=1.1, wpb=495975, bsz=16248, num_updates=34600, lr=0.00034001, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32017 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 978 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=549074, ups=1.11, wpb=496181, bsz=16363.7, num_updates=34700, lr=0.00033952, gnorm=0.176, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=32108 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1078 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=549920, ups=1.11, wpb=495093, bsz=16373, num_updates=34800, lr=0.000339032, gnorm=0.172, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=32198 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1178 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=550655, ups=1.11, wpb=495473, bsz=16707.6, num_updates=34900, lr=0.000338546, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=32288 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 epoch 021: 1278 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=549690, ups=1.11, wpb=494730, bsz=16686.5, num_updates=35000, lr=0.000338062, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=32378 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021 | valid on 'valid' subset | loss 3.704 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.702 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1378 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=311918, ups=0.63, wpb=495137, bsz=16127.8, num_updates=35100, lr=0.00033758, gnorm=0.176, clip=0, loss_scale=2, train_wall=139, gb_free=21, wall=32536 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1478 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551935, ups=1.11, wpb=496964, bsz=16400.2, num_updates=35200, lr=0.0003371, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.9, wall=32626 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1579 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=545433, ups=1.1, wpb=496666, bsz=16443.2, num_updates=35300, lr=0.000336622, gnorm=0.167, clip=0, loss_scale=1, train_wall=90, gb_free=22, wall=32717 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 epoch 021: 1679 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=552009, ups=1.12, wpb=494724, bsz=16523.2, num_updates=35400, lr=0.000336146, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=32807 end of epoch 21 (average epoch stats below) epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 epoch 021 | loss 3.64 | nll_loss 2.099 | ppl 4.29 | wps 520171 | ups 1.05 | wpb 495124 | bsz 16509.6 | num_updates 35410 | lr 0.000336099 | gnorm 0.17 | clip 0 | loss_scale 1 | train_wall 1543 | gb_free 23.8 | wall 32815 Start iterating over samples epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 90 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=538627, ups=1.1, wpb=490968, bsz=16514.1, num_updates=35500, lr=0.000335673, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=32898 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 190 / 1689 loss=3.619, nll_loss=2.076, ppl=4.22, wps=546471, ups=1.11, wpb=493898, bsz=16123.3, num_updates=35600, lr=0.000335201, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=32989 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 290 / 1689 loss=3.626, nll_loss=2.084, ppl=4.24, wps=555823, ups=1.12, wpb=494179, bsz=16647.7, num_updates=35700, lr=0.000334731, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33078 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 390 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=550015, ups=1.11, wpb=495523, bsz=16779.8, num_updates=35800, lr=0.000334263, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=33168 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 490 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=543607, ups=1.1, wpb=495534, bsz=16562.5, num_updates=35900, lr=0.000333797, gnorm=0.165, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=33259 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 epoch 022: 590 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=548395, ups=1.11, wpb=495823, bsz=16580.2, num_updates=36000, lr=0.000333333, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=33349 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022 | valid on 'valid' subset | loss 3.693 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.693 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 690 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=463192, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=33456 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 790 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=550420, ups=1.11, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=33546 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 890 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=546944, ups=1.11, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=33637 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 990 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=555246, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=33726 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1091 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=548024, ups=1.11, wpb=495044, bsz=16552.5, num_updates=36500, lr=0.000331042, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=19.5, wall=33816 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1191 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=551933, ups=1.11, wpb=496046, bsz=16388.2, num_updates=36600, lr=0.00033059, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=33906 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1291 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=553823, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=33996 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1391 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=547774, ups=1.1, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=34086 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1492 / 1689 loss=3.64, nll_loss=2.1, ppl=4.29, wps=546800, ups=1.1, wpb=495480, bsz=16265.5, num_updates=36900, lr=0.000329243, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=34177 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 epoch 022: 1592 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=549801, ups=1.11, wpb=495270, bsz=16576.4, num_updates=37000, lr=0.000328798, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=34267 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 epoch 022 | valid on 'valid' subset | loss 3.692 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.692 end of epoch 22 (average epoch stats below) epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 epoch 022 | loss 3.634 | nll_loss 2.093 | ppl 4.27 | wps 536344 | ups 1.08 | wpb 495145 | bsz 16503.3 | num_updates 37097 | lr 0.000328368 | gnorm 0.169 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 21.9 | wall 34373 Start iterating over samples epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 3 / 1689 loss=3.638, nll_loss=2.098, ppl=4.28, wps=452356, ups=0.92, wpb=491692, bsz=16391.3, num_updates=37100, lr=0.000328355, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=34376 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 103 / 1689 loss=3.613, nll_loss=2.069, ppl=4.19, wps=550039, ups=1.11, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=34466 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 203 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=555700, ups=1.12, wpb=495036, bsz=17039.4, num_updates=37300, lr=0.000327473, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34555 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 303 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=553532, ups=1.11, wpb=496937, bsz=16699.8, num_updates=37400, lr=0.000327035, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.1, wall=34645 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 403 / 1689 loss=3.628, nll_loss=2.086, ppl=4.25, wps=554901, ups=1.12, wpb=495736, bsz=16631.8, num_updates=37500, lr=0.000326599, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=34734 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 503 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=552620, ups=1.12, wpb=495584, bsz=16722.6, num_updates=37600, lr=0.000326164, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34824 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 603 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=553600, ups=1.12, wpb=494395, bsz=16210.4, num_updates=37700, lr=0.000325731, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34913 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 703 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=549523, ups=1.11, wpb=494606, bsz=16542.6, num_updates=37800, lr=0.0003253, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=35003 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 803 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554772, ups=1.12, wpb=495916, bsz=16497.4, num_updates=37900, lr=0.000324871, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=35092 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 epoch 023: 904 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=550860, ups=1.11, wpb=495112, bsz=16851.1, num_updates=38000, lr=0.000324443, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=35182 begin validation on "valid" subset epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023 | valid on 'valid' subset | loss 3.694 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.692 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1004 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=493038, ups=1, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35283 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1104 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=555218, ups=1.12, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.174, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=35372 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1204 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=555588, ups=1.12, wpb=495937, bsz=16195.2, num_updates=38300, lr=0.00032317, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=35461 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1304 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=553648, ups=1.12, wpb=495088, bsz=16557.8, num_updates=38400, lr=0.000322749, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=35551 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1404 / 1689 loss=3.63, nll_loss=2.089, ppl=4.25, wps=555701, ups=1.12, wpb=494603, bsz=16460, num_updates=38500, lr=0.000322329, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=20.2, wall=35640 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1505 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=542546, ups=1.1, wpb=494777, bsz=16534, num_updates=38600, lr=0.000321911, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=35731 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 epoch 023: 1605 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=548738, ups=1.11, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=35821 end of epoch 23 (average epoch stats below) epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 epoch 023 | loss 3.629 | nll_loss 2.087 | ppl 4.25 | wps 548432 | ups 1.11 | wpb 495119 | bsz 16503.7 | num_updates 38784 | lr 0.000321147 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 23.4 | wall 35896 Start iterating over samples epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 16 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=549328, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=35911 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 116 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=551876, ups=1.11, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=36001 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 epoch 024: 216 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=547703, ups=1.1, wpb=495898, bsz=16792.2, num_updates=39000, lr=0.000320256, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=36091 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.698 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.692 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 317 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=480934, ups=0.97, wpb=495631, bsz=16255.4, num_updates=39100, lr=0.000319847, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36194 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 417 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=551819, ups=1.12, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=36284 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 517 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=553297, ups=1.11, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=36374 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 617 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=547895, ups=1.11, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=36464 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 717 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=551313, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=36554 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 818 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=543988, ups=1.1, wpb=496559, bsz=16352.6, num_updates=39600, lr=0.000317821, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=36645 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 918 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552300, ups=1.11, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=36735 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1018 / 1689 loss=3.627, nll_loss=2.086, ppl=4.24, wps=556100, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.155, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=36824 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1118 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=549124, ups=1.11, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=36914 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 epoch 024: 1218 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=548606, ups=1.11, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=37004 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.692 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1318 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=445178, ups=0.9, wpb=494975, bsz=16478.1, num_updates=40100, lr=0.000315833, gnorm=0.167, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=37115 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1419 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548409, ups=1.11, wpb=494307, bsz=16261, num_updates=40200, lr=0.00031544, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=37206 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1519 / 1689 loss=3.634, nll_loss=2.094, ppl=4.27, wps=554935, ups=1.12, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=37295 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 epoch 024: 1619 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554591, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37384 end of epoch 24 (average epoch stats below) epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 epoch 024 | loss 3.624 | nll_loss 2.082 | ppl 4.23 | wps 538370 | ups 1.09 | wpb 495122 | bsz 16508.9 | num_updates 40470 | lr 0.000314386 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1500 | gb_free 22.9 | wall 37446 Start iterating over samples epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 30 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=547887, ups=1.12, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=37474 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 130 / 1689 loss=3.611, nll_loss=2.067, ppl=4.19, wps=552372, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=37563 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 231 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=549218, ups=1.11, wpb=496734, bsz=16483.3, num_updates=40700, lr=0.000313497, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=37654 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 331 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=550703, ups=1.11, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=37744 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 431 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=556438, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=37833 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 epoch 025: 531 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=554101, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=37922 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.689 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.689 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 631 / 1689 loss=3.622, nll_loss=2.079, ppl=4.23, wps=189048, ups=0.38, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.162, clip=0, loss_scale=2, train_wall=205, gb_free=21.9, wall=38184 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 732 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=553312, ups=1.12, wpb=495375, bsz=16641.4, num_updates=41200, lr=0.000311588, gnorm=0.166, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=38274 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 832 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=552547, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=38363 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 932 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554101, ups=1.12, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=38453 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1032 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=551221, ups=1.12, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=38542 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1133 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=553523, ups=1.12, wpb=494866, bsz=16108.6, num_updates=41600, lr=0.000310087, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=38632 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1233 / 1689 loss=3.623, nll_loss=2.081, ppl=4.23, wps=550608, ups=1.11, wpb=495260, bsz=16408.2, num_updates=41700, lr=0.000309715, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=38722 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1333 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=556291, ups=1.12, wpb=496890, bsz=16622, num_updates=41800, lr=0.000309344, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=38811 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1433 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=553288, ups=1.12, wpb=495445, bsz=16481.3, num_updates=41900, lr=0.000308975, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.8, wall=38901 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 epoch 025: 1533 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553883, ups=1.12, wpb=494153, bsz=16772.6, num_updates=42000, lr=0.000308607, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38990 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.689 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 epoch 025: 1633 / 1689 loss=3.621, nll_loss=2.079, ppl=4.22, wps=491488, ups=0.99, wpb=495367, bsz=16379.8, num_updates=42100, lr=0.00030824, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=39091 end of epoch 25 (average epoch stats below) epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 epoch 025 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 492851 | ups 1 | wpb 495132 | bsz 16498 | num_updates 42156 | lr 0.000308035 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 23.6 | wall 39140 Start iterating over samples epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 45 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=542777, ups=1.1, wpb=491780, bsz=16480.3, num_updates=42200, lr=0.000307875, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=39181 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 145 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=557014, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39270 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 245 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=551076, ups=1.11, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=39360 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 345 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=550128, ups=1.11, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=39450 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 445 / 1689 loss=3.61, nll_loss=2.066, ppl=4.19, wps=551867, ups=1.12, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=39540 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 545 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=548451, ups=1.11, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=39630 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 646 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=550270, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=20.3, wall=39720 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 746 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=549930, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.175, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=39810 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 epoch 026: 846 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548346, ups=1.11, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=39901 begin validation on "valid" subset epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026 | valid on 'valid' subset | loss 3.686 | nll_loss 2.128 | ppl 4.37 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.686 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 946 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=452869, ups=0.91, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=40010 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1046 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=549688, ups=1.11, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40100 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1146 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=550979, ups=1.11, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40190 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1246 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=557534, ups=1.12, wpb=495882, bsz=16221.9, num_updates=43400, lr=0.000303588, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=40279 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1346 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=553009, ups=1.12, wpb=495023, bsz=16453, num_updates=43500, lr=0.000303239, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=40369 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1446 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=553248, ups=1.12, wpb=496010, bsz=16645.4, num_updates=43600, lr=0.000302891, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=40458 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1546 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=548195, ups=1.11, wpb=494881, bsz=16423.7, num_updates=43700, lr=0.000302545, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=40549 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 epoch 026: 1646 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=548359, ups=1.11, wpb=495054, bsz=16412.1, num_updates=43800, lr=0.000302199, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=40639 end of epoch 26 (average epoch stats below) epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 epoch 026 | loss 3.614 | nll_loss 2.071 | ppl 4.2 | wps 543068 | ups 1.1 | wpb 495119 | bsz 16507 | num_updates 43842 | lr 0.000302054 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 22.8 | wall 40677 Start iterating over samples epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 58 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=532544, ups=1.09, wpb=490512, bsz=16018.7, num_updates=43900, lr=0.000301855, gnorm=0.154, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=40731 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 epoch 027: 158 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=547339, ups=1.1, wpb=495568, bsz=16518.8, num_updates=44000, lr=0.000301511, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=40821 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027 | valid on 'valid' subset | loss 3.686 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.686 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 258 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=449729, ups=0.91, wpb=495922, bsz=16109.4, num_updates=44100, lr=0.000301169, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40932 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 358 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=547466, ups=1.1, wpb=495788, bsz=16514.6, num_updates=44200, lr=0.000300828, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=41022 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 458 / 1689 loss=3.608, nll_loss=2.064, ppl=4.18, wps=557404, ups=1.12, wpb=496217, bsz=16313.8, num_updates=44300, lr=0.000300489, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41111 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 559 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=548735, ups=1.11, wpb=496592, bsz=16303.1, num_updates=44400, lr=0.00030015, gnorm=0.16, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=41202 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 659 / 1689 loss=3.607, nll_loss=2.063, ppl=4.18, wps=549743, ups=1.11, wpb=493376, bsz=16730.2, num_updates=44500, lr=0.000299813, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=41292 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 759 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=543718, ups=1.1, wpb=494238, bsz=16785.8, num_updates=44600, lr=0.000299476, gnorm=0.169, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=41382 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 859 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=549979, ups=1.11, wpb=494483, bsz=16667.9, num_updates=44700, lr=0.000299141, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=41472 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 959 / 1689 loss=3.613, nll_loss=2.07, ppl=4.2, wps=550054, ups=1.11, wpb=495283, bsz=16250.5, num_updates=44800, lr=0.000298807, gnorm=0.158, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=41562 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1059 / 1689 loss=3.615, nll_loss=2.072, ppl=4.2, wps=549301, ups=1.11, wpb=494132, bsz=16846.2, num_updates=44900, lr=0.000298474, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=41652 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 epoch 027: 1159 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=550570, ups=1.11, wpb=495243, bsz=16942.9, num_updates=45000, lr=0.000298142, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=41742 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027 | valid on 'valid' subset | loss 3.675 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.675 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1259 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=391435, ups=0.79, wpb=495840, bsz=16404.9, num_updates=45100, lr=0.000297812, gnorm=0.164, clip=0, loss_scale=2, train_wall=93, gb_free=21.9, wall=41869 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1359 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=559363, ups=1.13, wpb=495692, bsz=16399.3, num_updates=45200, lr=0.000297482, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=41958 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1459 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=557602, ups=1.12, wpb=495992, bsz=16518.2, num_updates=45300, lr=0.000297154, gnorm=0.158, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=42047 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1560 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552277, ups=1.11, wpb=495377, bsz=16345, num_updates=45400, lr=0.000296826, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=42136 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 epoch 027: 1661 / 1689 loss=3.615, nll_loss=2.072, ppl=4.21, wps=547239, ups=1.1, wpb=495632, bsz=16832, num_updates=45500, lr=0.0002965, gnorm=0.163, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=42227 end of epoch 27 (average epoch stats below) epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 epoch 027 | loss 3.61 | nll_loss 2.067 | ppl 4.19 | wps 530334 | ups 1.07 | wpb 495110 | bsz 16506.6 | num_updates 45528 | lr 0.000296409 | gnorm 0.163 | clip 0 | loss_scale 1 | train_wall 1501 | gb_free 23.1 | wall 42251 Start iterating over samples epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 72 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=541763, ups=1.1, wpb=490706, bsz=16537, num_updates=45600, lr=0.000296174, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=42317 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 172 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=555369, ups=1.12, wpb=497853, bsz=16387.4, num_updates=45700, lr=0.00029585, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.7, wall=42407 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 272 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=552505, ups=1.12, wpb=494754, bsz=16510.1, num_updates=45800, lr=0.000295527, gnorm=0.168, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=42497 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 372 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=551817, ups=1.12, wpb=494347, bsz=16558, num_updates=45900, lr=0.000295205, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=19.9, wall=42586 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 epoch 028: 472 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=545015, ups=1.1, wpb=496562, bsz=16736.2, num_updates=46000, lr=0.000294884, gnorm=0.164, clip=0, loss_scale=2, train_wall=90, gb_free=20.1, wall=42677 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.675 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 572 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=436793, ups=0.88, wpb=494213, bsz=17086.6, num_updates=46100, lr=0.000294564, gnorm=0.162, clip=0, loss_scale=2, train_wall=96, gb_free=20.8, wall=42791 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 672 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=551018, ups=1.11, wpb=495836, bsz=16464.5, num_updates=46200, lr=0.000294245, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=20.5, wall=42881 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 772 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=545550, ups=1.1, wpb=496542, bsz=16452.6, num_updates=46300, lr=0.000293927, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=42972 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 872 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=549086, ups=1.11, wpb=496256, bsz=16380.6, num_updates=46400, lr=0.00029361, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=43062 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 973 / 1689 loss=3.612, nll_loss=2.069, ppl=4.2, wps=544590, ups=1.1, wpb=495507, bsz=16478.4, num_updates=46500, lr=0.000293294, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=43153 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1073 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=548170, ups=1.11, wpb=495607, bsz=16216, num_updates=46600, lr=0.000292979, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=43243 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1173 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=550742, ups=1.11, wpb=497060, bsz=16281.1, num_updates=46700, lr=0.000292666, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=43334 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1273 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=550796, ups=1.11, wpb=494667, bsz=16337.4, num_updates=46800, lr=0.000292353, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43423 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1373 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=549284, ups=1.11, wpb=496681, bsz=16606.9, num_updates=46900, lr=0.000292041, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=43514 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 epoch 028: 1473 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=548072, ups=1.11, wpb=494584, bsz=16508.2, num_updates=47000, lr=0.00029173, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43604 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028 | valid on 'valid' subset | loss 3.689 | nll_loss 2.13 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.675 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1573 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=387353, ups=0.78, wpb=494072, bsz=16454.6, num_updates=47100, lr=0.00029142, gnorm=0.167, clip=0, loss_scale=2, train_wall=97, gb_free=21.3, wall=43732 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 epoch 028: 1673 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=556306, ups=1.13, wpb=492936, bsz=16809.1, num_updates=47200, lr=0.000291111, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=43820 end of epoch 28 (average epoch stats below) epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 epoch 028 | loss 3.606 | nll_loss 2.062 | ppl 4.18 | wps 528081 | ups 1.07 | wpb 495126 | bsz 16506 | num_updates 47216 | lr 0.000291062 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 1511 | gb_free 22.4 | wall 43834 Start iterating over samples epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 84 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=551410, ups=1.12, wpb=491052, bsz=16445.8, num_updates=47300, lr=0.000290803, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=43909 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 184 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=555095, ups=1.12, wpb=495134, bsz=16877.7, num_updates=47400, lr=0.000290496, gnorm=0.152, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=43998 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 285 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=548222, ups=1.11, wpb=495251, bsz=16240.8, num_updates=47500, lr=0.000290191, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44089 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 385 / 1689 loss=3.6, nll_loss=2.055, ppl=4.16, wps=549064, ups=1.11, wpb=494024, bsz=16035.2, num_updates=47600, lr=0.000289886, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44179 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 485 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=550936, ups=1.11, wpb=495622, bsz=16748.3, num_updates=47700, lr=0.000289581, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=44269 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 585 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=548141, ups=1.11, wpb=495093, bsz=16816.4, num_updates=47800, lr=0.000289278, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=44359 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 685 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551754, ups=1.11, wpb=496890, bsz=16569.5, num_updates=47900, lr=0.000288976, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=44449 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 epoch 029: 786 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=542827, ups=1.09, wpb=495966, bsz=16345.4, num_updates=48000, lr=0.000288675, gnorm=0.158, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=44541 begin validation on "valid" subset epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029 | valid on 'valid' subset | loss 3.703 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.675 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 886 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=481184, ups=0.98, wpb=493397, bsz=16704.8, num_updates=48100, lr=0.000288375, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=44643 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 987 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=546646, ups=1.11, wpb=493038, bsz=16489.1, num_updates=48200, lr=0.000288076, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=44733 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1087 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=555520, ups=1.12, wpb=496382, bsz=16395.8, num_updates=48300, lr=0.000287777, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=44823 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1187 / 1689 loss=3.604, nll_loss=2.06, ppl=4.17, wps=555124, ups=1.12, wpb=496354, bsz=16757.9, num_updates=48400, lr=0.00028748, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=44912 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1287 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=554218, ups=1.12, wpb=496042, bsz=16553.2, num_updates=48500, lr=0.000287183, gnorm=0.16, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45002 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1387 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=555562, ups=1.12, wpb=496637, bsz=16498.3, num_updates=48600, lr=0.000286888, gnorm=0.157, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45091 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1487 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=551561, ups=1.12, wpb=494586, bsz=16412.4, num_updates=48700, lr=0.000286593, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=45181 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1588 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=547644, ups=1.11, wpb=494316, bsz=16236.1, num_updates=48800, lr=0.000286299, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=45271 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 epoch 029: 1688 / 1689 loss=3.608, nll_loss=2.066, ppl=4.19, wps=555246, ups=1.12, wpb=496353, bsz=16272.2, num_updates=48900, lr=0.000286006, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=45360 end of epoch 29 (average epoch stats below) epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 epoch 029 | loss 3.602 | nll_loss 2.058 | ppl 4.16 | wps 546438 | ups 1.1 | wpb 495114 | bsz 16497.3 | num_updates 48901 | lr 0.000286003 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1487 | gb_free 25.7 | wall 45360 Start iterating over samples epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 epoch 030: 99 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=543814, ups=1.11, wpb=491177, bsz=16451.8, num_updates=49000, lr=0.000285714, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=45451 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.682 | nll_loss 2.121 | ppl 4.35 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.675 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 199 / 1689 loss=3.586, nll_loss=2.039, ppl=4.11, wps=485029, ups=0.98, wpb=495452, bsz=16212.6, num_updates=49100, lr=0.000285423, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45553 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 299 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=549923, ups=1.11, wpb=495851, bsz=16249, num_updates=49200, lr=0.000285133, gnorm=0.17, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45643 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 399 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=549976, ups=1.11, wpb=494905, bsz=16506, num_updates=49300, lr=0.000284844, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=45733 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 499 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=554940, ups=1.12, wpb=496335, bsz=16781.5, num_updates=49400, lr=0.000284555, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=45822 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 600 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=552624, ups=1.11, wpb=496219, bsz=16463.4, num_updates=49500, lr=0.000284268, gnorm=0.168, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=45912 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 700 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=557327, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=46001 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 800 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=559947, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=46090 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 900 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553479, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=46179 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1000 / 1689 loss=3.594, nll_loss=2.05, ppl=4.14, wps=555259, ups=1.12, wpb=494874, bsz=16581.8, num_updates=49900, lr=0.000283126, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=20.4, wall=46268 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 epoch 030: 1100 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=551777, ups=1.11, wpb=494893, bsz=16594.3, num_updates=50000, lr=0.000282843, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=46358 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030 | valid on 'valid' subset | loss 3.681 | nll_loss 2.124 | ppl 4.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.675 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1200 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=486786, ups=0.98, wpb=494233, bsz=16908.8, num_updates=50100, lr=0.00028256, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=46459 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1300 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=551712, ups=1.12, wpb=494285, bsz=16314.4, num_updates=50200, lr=0.000282279, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=46549 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1401 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=541264, ups=1.09, wpb=495524, bsz=16623.1, num_updates=50300, lr=0.000281998, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=46641 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1501 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=545757, ups=1.1, wpb=495914, bsz=16674.2, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=46731 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 epoch 030: 1601 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=550501, ups=1.11, wpb=496278, bsz=16505, num_updates=50500, lr=0.000281439, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=20.7, wall=46822 end of epoch 30 (average epoch stats below) epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 epoch 030 | loss 3.599 | nll_loss 2.054 | ppl 4.15 | wps 542569 | ups 1.1 | wpb 495119 | bsz 16503.1 | num_updates 50588 | lr 0.000281194 | gnorm 0.165 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 23.6 | wall 46900 Start iterating over samples epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 12 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=534307, ups=1.09, wpb=490805, bsz=16339.3, num_updates=50600, lr=0.000281161, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=46913 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 112 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=553666, ups=1.12, wpb=494915, bsz=16416.8, num_updates=50700, lr=0.000280883, gnorm=0.159, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=47003 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 212 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=551323, ups=1.11, wpb=495137, bsz=16674.5, num_updates=50800, lr=0.000280607, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47093 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 312 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=552449, ups=1.12, wpb=495407, bsz=16221.7, num_updates=50900, lr=0.000280331, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=47182 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 epoch 031: 412 / 1689 loss=3.597, nll_loss=2.052, ppl=4.15, wps=545192, ups=1.1, wpb=493508, bsz=16418.4, num_updates=51000, lr=0.000280056, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=47273 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.691 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.675 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 512 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=381879, ups=0.77, wpb=494950, bsz=16806.6, num_updates=51100, lr=0.000279782, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21, wall=47402 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 612 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=558841, ups=1.13, wpb=495035, bsz=16520.3, num_updates=51200, lr=0.000279508, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=47491 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 714 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=546130, ups=1.1, wpb=496262, bsz=16671, num_updates=51300, lr=0.000279236, gnorm=0.166, clip=0, loss_scale=1, train_wall=90, gb_free=21.1, wall=47582 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 814 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=557350, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=47671 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 914 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=556956, ups=1.12, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=47760 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1014 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=553855, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47849 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1114 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=557024, ups=1.12, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=47938 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1214 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=551808, ups=1.12, wpb=493685, bsz=16732.2, num_updates=51800, lr=0.000277885, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=48028 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1314 / 1689 loss=3.605, nll_loss=2.061, ppl=4.17, wps=554224, ups=1.12, wpb=495414, bsz=16293.6, num_updates=51900, lr=0.000277617, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=48117 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 epoch 031: 1414 / 1689 loss=3.598, nll_loss=2.054, ppl=4.15, wps=550281, ups=1.11, wpb=495071, bsz=16884.5, num_updates=52000, lr=0.00027735, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=48207 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031 | valid on 'valid' subset | loss 3.676 | nll_loss 2.116 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.675 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1514 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=480739, ups=0.97, wpb=496809, bsz=16829.7, num_updates=52100, lr=0.000277084, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=48311 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 epoch 031: 1614 / 1689 loss=3.599, nll_loss=2.055, ppl=4.16, wps=550166, ups=1.11, wpb=495410, bsz=16338.1, num_updates=52200, lr=0.000276818, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=48401 end of epoch 31 (average epoch stats below) epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 epoch 031 | loss 3.595 | nll_loss 2.05 | ppl 4.14 | wps 532784 | ups 1.08 | wpb 495122 | bsz 16508.2 | num_updates 52274 | lr 0.000276622 | gnorm 0.162 | clip 0 | loss_scale 1 | train_wall 1489 | gb_free 22.7 | wall 48467 Start iterating over samples epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 26 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=545668, ups=1.11, wpb=492815, bsz=16619, num_updates=52300, lr=0.000276553, gnorm=0.159, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=48491 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 126 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=550263, ups=1.11, wpb=494606, bsz=16728.9, num_updates=52400, lr=0.000276289, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=48581 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 226 / 1689 loss=3.583, nll_loss=2.036, ppl=4.1, wps=547246, ups=1.11, wpb=494388, bsz=16597, num_updates=52500, lr=0.000276026, gnorm=0.155, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48671 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 326 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=550968, ups=1.11, wpb=496121, bsz=16508.9, num_updates=52600, lr=0.000275764, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=48761 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 426 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=549516, ups=1.11, wpb=496786, bsz=16582.5, num_updates=52700, lr=0.000275502, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=48852 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 526 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=546435, ups=1.1, wpb=494746, bsz=16806.2, num_updates=52800, lr=0.000275241, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=48942 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 626 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=546284, ups=1.1, wpb=495925, bsz=16486.6, num_updates=52900, lr=0.000274981, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49033 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 epoch 032: 726 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=544844, ups=1.1, wpb=493742, bsz=16564.1, num_updates=53000, lr=0.000274721, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=49124 begin validation on "valid" subset epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032 | valid on 'valid' subset | loss 3.688 | nll_loss 2.135 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.675 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 826 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=382112, ups=0.77, wpb=495355, bsz=16072.7, num_updates=53100, lr=0.000274462, gnorm=0.157, clip=0, loss_scale=2, train_wall=109, gb_free=21.6, wall=49253 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 927 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=546996, ups=1.1, wpb=495774, bsz=16494.3, num_updates=53200, lr=0.000274204, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=49344 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1027 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=553904, ups=1.12, wpb=496277, bsz=16083.8, num_updates=53300, lr=0.000273947, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=49433 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1127 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=545715, ups=1.1, wpb=494436, bsz=16456.7, num_updates=53400, lr=0.00027369, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=49524 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1227 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=553642, ups=1.12, wpb=494634, bsz=16220.2, num_updates=53500, lr=0.000273434, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=49613 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1327 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=550746, ups=1.11, wpb=496465, bsz=16180.3, num_updates=53600, lr=0.000273179, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=49704 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1427 / 1689 loss=3.603, nll_loss=2.059, ppl=4.17, wps=547402, ups=1.11, wpb=494048, bsz=16755.7, num_updates=53700, lr=0.000272925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=49794 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1527 / 1689 loss=3.596, nll_loss=2.052, ppl=4.15, wps=545991, ups=1.1, wpb=497168, bsz=16699.8, num_updates=53800, lr=0.000272671, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=49885 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 epoch 032: 1627 / 1689 loss=3.601, nll_loss=2.058, ppl=4.16, wps=545244, ups=1.1, wpb=494444, bsz=16512.6, num_updates=53900, lr=0.000272418, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=49976 end of epoch 32 (average epoch stats below) epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 epoch 032 | loss 3.592 | nll_loss 2.047 | ppl 4.13 | wps 534365 | ups 1.08 | wpb 495108 | bsz 16506.8 | num_updates 53962 | lr 0.000272261 | gnorm 0.162 | clip 0 | loss_scale 2 | train_wall 1519 | gb_free 22.5 | wall 50031 Start iterating over samples epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 epoch 033: 39 / 1689 loss=3.596, nll_loss=2.051, ppl=4.14, wps=540892, ups=1.1, wpb=491713, bsz=16489.9, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=50066 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.663 | nll_loss 2.104 | ppl 4.3 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.663 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 139 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=458490, ups=0.93, wpb=495101, bsz=16534.7, num_updates=54100, lr=0.000271914, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50174 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 239 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=552358, ups=1.11, wpb=497003, bsz=17004.9, num_updates=54200, lr=0.000271663, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=50264 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 339 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=548051, ups=1.11, wpb=493958, bsz=16279.4, num_updates=54300, lr=0.000271413, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=50355 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 439 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=547612, ups=1.11, wpb=494606, bsz=16431.9, num_updates=54400, lr=0.000271163, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=50445 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 539 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=546958, ups=1.1, wpb=496249, bsz=16653.4, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=50536 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 639 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=544977, ups=1.1, wpb=495279, bsz=16748.6, num_updates=54600, lr=0.000270666, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=50626 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 739 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=549834, ups=1.11, wpb=493969, bsz=16265.9, num_updates=54700, lr=0.000270418, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=50716 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 839 / 1689 loss=3.592, nll_loss=2.048, ppl=4.13, wps=551495, ups=1.12, wpb=494433, bsz=16742.2, num_updates=54800, lr=0.000270172, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=50806 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 939 / 1689 loss=3.584, nll_loss=2.037, ppl=4.1, wps=553890, ups=1.12, wpb=496228, bsz=15993.4, num_updates=54900, lr=0.000269925, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=50896 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 epoch 033: 1040 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=542917, ups=1.1, wpb=495236, bsz=16531, num_updates=55000, lr=0.00026968, gnorm=0.161, clip=0, loss_scale=2, train_wall=90, gb_free=20.3, wall=50987 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033 | valid on 'valid' subset | loss 3.674 | nll_loss 2.119 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.663 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1140 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=489691, ups=0.99, wpb=494265, bsz=16463.3, num_updates=55100, lr=0.000269435, gnorm=0.156, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=51088 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1240 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=551734, ups=1.11, wpb=495934, bsz=17075.5, num_updates=55200, lr=0.000269191, gnorm=0.161, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51178 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1340 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=555305, ups=1.12, wpb=496155, bsz=16304.6, num_updates=55300, lr=0.000268947, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=51267 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1440 / 1689 loss=3.6, nll_loss=2.057, ppl=4.16, wps=554308, ups=1.12, wpb=496209, bsz=16818.6, num_updates=55400, lr=0.000268705, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51356 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1541 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=543856, ups=1.1, wpb=495318, bsz=16632.1, num_updates=55500, lr=0.000268462, gnorm=0.165, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=51448 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 epoch 033: 1641 / 1689 loss=3.595, nll_loss=2.051, ppl=4.14, wps=550259, ups=1.11, wpb=495372, bsz=16055.3, num_updates=55600, lr=0.000268221, gnorm=0.152, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51538 end of epoch 33 (average epoch stats below) epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 epoch 033 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 538926 | ups 1.09 | wpb 495120 | bsz 16504.1 | num_updates 55648 | lr 0.000268105 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 23.9 | wall 51580 Start iterating over samples epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 52 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=538967, ups=1.1, wpb=491120, bsz=16326.9, num_updates=55700, lr=0.00026798, gnorm=0.164, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=51629 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 152 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=549071, ups=1.11, wpb=496327, bsz=16808.4, num_updates=55800, lr=0.00026774, gnorm=0.16, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=51719 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 252 / 1689 loss=3.574, nll_loss=2.026, ppl=4.07, wps=553270, ups=1.11, wpb=496293, bsz=16057.4, num_updates=55900, lr=0.0002675, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=51809 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 epoch 034: 352 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=548331, ups=1.11, wpb=495297, bsz=16371.4, num_updates=56000, lr=0.000267261, gnorm=0.151, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51899 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.67 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.663 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 452 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=351869, ups=0.71, wpb=494345, bsz=16546.5, num_updates=56100, lr=0.000267023, gnorm=0.157, clip=0, loss_scale=2, train_wall=122, gb_free=21.8, wall=52040 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 552 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=554302, ups=1.12, wpb=496132, bsz=16462, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=52129 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 652 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=551194, ups=1.11, wpb=494638, bsz=16709.8, num_updates=56300, lr=0.000266548, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=52219 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 753 / 1689 loss=3.584, nll_loss=2.039, ppl=4.11, wps=547499, ups=1.1, wpb=496710, bsz=16658.8, num_updates=56400, lr=0.000266312, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=52310 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 853 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=548258, ups=1.11, wpb=494671, bsz=16460, num_updates=56500, lr=0.000266076, gnorm=0.155, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=52400 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 953 / 1689 loss=3.588, nll_loss=2.042, ppl=4.12, wps=546442, ups=1.11, wpb=493546, bsz=16621.1, num_updates=56600, lr=0.000265841, gnorm=0.161, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=52490 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1053 / 1689 loss=3.593, nll_loss=2.049, ppl=4.14, wps=549213, ups=1.11, wpb=495158, bsz=16123.1, num_updates=56700, lr=0.000265606, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=20.7, wall=52580 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1153 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=548885, ups=1.11, wpb=494984, bsz=16525.5, num_updates=56800, lr=0.000265372, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=52671 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1254 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=542533, ups=1.09, wpb=495599, bsz=16481.2, num_updates=56900, lr=0.000265139, gnorm=0.154, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=52762 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 epoch 034: 1354 / 1689 loss=3.588, nll_loss=2.043, ppl=4.12, wps=549843, ups=1.11, wpb=496704, bsz=16781, num_updates=57000, lr=0.000264906, gnorm=0.171, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=52852 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034 | valid on 'valid' subset | loss 3.676 | nll_loss 2.117 | ppl 4.34 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.663 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1454 / 1689 loss=3.597, nll_loss=2.053, ppl=4.15, wps=488997, ups=0.99, wpb=495634, bsz=16478.8, num_updates=57100, lr=0.000264674, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=52954 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1554 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=553045, ups=1.12, wpb=493902, bsz=16546.6, num_updates=57200, lr=0.000264443, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53043 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 epoch 034: 1654 / 1689 loss=3.587, nll_loss=2.042, ppl=4.12, wps=551370, ups=1.11, wpb=495588, bsz=16330.2, num_updates=57300, lr=0.000264212, gnorm=0.157, clip=0, loss_scale=1, train_wall=89, gb_free=21.3, wall=53133 end of epoch 34 (average epoch stats below) epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 epoch 034 | loss 3.586 | nll_loss 2.04 | ppl 4.11 | wps 527352 | ups 1.07 | wpb 495115 | bsz 16502.7 | num_updates 57335 | lr 0.000264131 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 1528 | gb_free 22.2 | wall 53164 Start iterating over samples epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 65 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=546472, ups=1.11, wpb=492168, bsz=16369.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=53223 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 165 / 1689 loss=3.571, nll_loss=2.023, ppl=4.07, wps=547425, ups=1.11, wpb=493883, bsz=16350.8, num_updates=57500, lr=0.000263752, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=53313 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 266 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=543941, ups=1.09, wpb=496787, bsz=16492, num_updates=57600, lr=0.000263523, gnorm=0.158, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=53404 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 367 / 1689 loss=3.572, nll_loss=2.025, ppl=4.07, wps=546311, ups=1.1, wpb=496221, bsz=16503.6, num_updates=57700, lr=0.000263295, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.7, wall=53495 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 467 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=546643, ups=1.11, wpb=494032, bsz=16485.3, num_updates=57800, lr=0.000263067, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=89, gb_free=22, wall=53586 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 567 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=549443, ups=1.11, wpb=495676, bsz=16243.5, num_updates=57900, lr=0.00026284, gnorm=0.167, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=53676 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 epoch 035: 667 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=550981, ups=1.11, wpb=494556, bsz=16406.9, num_updates=58000, lr=0.000262613, gnorm=0.171, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=53766 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.665 | nll_loss 2.108 | ppl 4.31 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.663 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 767 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=491614, ups=0.99, wpb=494722, bsz=16807.8, num_updates=58100, lr=0.000262387, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=53866 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 867 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=553097, ups=1.11, wpb=496598, bsz=16537.3, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=53956 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 967 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=549932, ups=1.11, wpb=495080, bsz=16686.6, num_updates=58300, lr=0.000261936, gnorm=0.163, clip=0, loss_scale=1, train_wall=89, gb_free=20, wall=54046 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1067 / 1689 loss=3.584, nll_loss=2.038, ppl=4.11, wps=553826, ups=1.12, wpb=496057, bsz=16356.2, num_updates=58400, lr=0.000261712, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=54136 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1167 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=559185, ups=1.13, wpb=496196, bsz=16385, num_updates=58500, lr=0.000261488, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=54224 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1267 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=551841, ups=1.12, wpb=494458, bsz=16238.9, num_updates=58600, lr=0.000261265, gnorm=0.169, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=54314 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1367 / 1689 loss=3.589, nll_loss=2.044, ppl=4.12, wps=550227, ups=1.11, wpb=493486, bsz=16442, num_updates=58700, lr=0.000261042, gnorm=0.155, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=54404 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1467 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=550505, ups=1.11, wpb=496483, bsz=16614.4, num_updates=58800, lr=0.00026082, gnorm=0.152, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=54494 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1567 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=548498, ups=1.11, wpb=495227, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=54584 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 epoch 035: 1667 / 1689 loss=3.594, nll_loss=2.049, ppl=4.14, wps=548084, ups=1.11, wpb=495313, bsz=16817.8, num_updates=59000, lr=0.000260378, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=54674 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 epoch 035 | valid on 'valid' subset | loss 3.669 | nll_loss 2.112 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.663 end of epoch 35 (average epoch stats below) epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 epoch 035 | loss 3.583 | nll_loss 2.037 | ppl 4.1 | wps 541356 | ups 1.09 | wpb 495117 | bsz 16503.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1494 | gb_free 22.8 | wall 54707 Start iterating over samples epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 78 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=479612, ups=0.98, wpb=491544, bsz=16814.9, num_updates=59100, lr=0.000260157, gnorm=0.157, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=54777 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 181 / 1689 loss=3.566, nll_loss=2.017, ppl=4.05, wps=536060, ups=1.09, wpb=493685, bsz=16200.6, num_updates=59200, lr=0.000259938, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=54869 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 281 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=552353, ups=1.12, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.162, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54959 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 381 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=553445, ups=1.11, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.166, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55048 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 481 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=550140, ups=1.11, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.153, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.3, wall=55139 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 581 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=548065, ups=1.11, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.16, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=55229 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 681 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=551633, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.161, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=55319 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 781 / 1689 loss=3.587, nll_loss=2.041, ppl=4.12, wps=550270, ups=1.11, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.158, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=55408 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 881 / 1689 loss=3.579, nll_loss=2.033, ppl=4.09, wps=550061, ups=1.11, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.157, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55498 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 epoch 036: 981 / 1689 loss=3.578, nll_loss=2.032, ppl=4.09, wps=552144, ups=1.11, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.156, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=55588 Stopping training due to num_updates: 60000 >= max_update: 60000 begin validation on "valid" subset epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 epoch 036 | valid on 'valid' subset | loss 3.667 | nll_loss 2.111 | ppl 4.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.663 end of epoch 36 (average epoch stats below) epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 epoch 036 | loss 3.575 | nll_loss 2.028 | ppl 4.08 | wps 541991 | ups 1.1 | wpb 494866 | bsz 16451.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.161 | clip 0 | loss_scale 1 | train_wall 863 | gb_free 21.4 | wall 55599 done training in 55585.7 seconds