{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': 'simple', 'log_file': 'chkpt/en-ja.do02.ado01/train.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'wmt23', 'azureml_logging': False, 'seed': 0, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 8, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 8, 'distributed_num_procs': 8, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': 'tcp://localhost:53105', 'distributed_port': 53105, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 8, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 16384, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 100000, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 16384, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [4], 'lr': [0.001], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False, 'debug_param_names': False}, 'checkpoint': {'_name': None, 'save_dir': 'chkpt/en-ja.do02.ado01', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 100000, 'save_interval_updates': 1000, 'keep_interval_updates': 10, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 8}, 'generation': {'_name': None, 'beam': 5, 'beam_mt': 0, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'max_len_a_mt': 0.0, 'max_len_b_mt': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'lenpen_mt': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(no_progress_bar=False, log_interval=100, log_format='simple', log_file='chkpt/en-ja.do02.ado01/train.log', aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project='wmt23', azureml_logging=False, seed=0, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=True, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=8, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='label_smoothed_cross_entropy', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=0, skip_invalid_size_inputs_valid_test=False, max_tokens=16384, batch_size=None, required_batch_size_multiple=8, required_seq_len_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', combine_valid_subsets=None, ignore_unused_valid_subsets=False, validate_interval=100000, validate_interval_updates=0, validate_after_updates=0, fixed_validation_seed=None, disable_validation=False, max_tokens_valid=16384, batch_size_valid=None, max_valid_steps=None, curriculum=0, gen_subset='test', num_shards=1, shard_id=0, grouped_shuffling=False, update_epoch_batch_itr=False, update_ordered_indices_seed=False, distributed_world_size=8, distributed_num_procs=8, distributed_rank=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, device_id=0, distributed_no_spawn=False, ddp_backend='pytorch_ddp', ddp_comm_hook='none', bucket_cap_mb=25, fix_batches_to_gpus=False, find_unused_parameters=False, gradient_as_bucket_view=False, fast_stat_sync=False, heartbeat_timeout=-1, broadcast_buffers=False, slowmo_momentum=None, slowmo_base_algorithm='localsgd', localsgd_frequency=3, nprocs_per_node=8, pipeline_model_parallel=False, pipeline_balance=None, pipeline_devices=None, pipeline_chunks=0, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_checkpoint='never', zero_sharding='none', no_reshard_after_forward=False, fp32_reduce_scatter=False, cpu_offload=False, use_sharded_state=False, not_fsdp_flatten_parameters=False, arch='transformer_vaswani_wmt_en_de_big', max_epoch=0, max_update=60000, stop_time_hours=0, clip_norm=1.0, sentence_avg=False, update_freq=[4], lr=[0.001], stop_min_lr=-1.0, use_bmuf=False, skip_remainder_batch=False, debug_param_names=False, save_dir='chkpt/en-ja.do02.ado01', restore_file='checkpoint_last.pt', continue_once=None, finetune_from_model=None, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, optimizer_overrides='{}', save_interval=100000, save_interval_updates=1000, keep_interval_updates=10, keep_interval_updates_pattern=-1, keep_last_epochs=-1, keep_best_checkpoints=-1, no_save=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_save_optimizer_state=False, best_checkpoint_metric='loss', maximize_best_checkpoint_metric=False, patience=-1, checkpoint_suffix='', checkpoint_shard_count=1, load_checkpoint_on_all_dp_ranks=False, write_checkpoints_asynchronously=False, store_ema=False, ema_decay=0.9999, ema_start_update=0, ema_seed_model=None, ema_update_freq=1, ema_fp32=False, data='binarized/en-ja/', source_lang=None, target_lang=None, load_alignments=False, left_pad_source=True, left_pad_target=False, upsample_primary=-1, truncate_source=False, num_batch_buckets=0, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_tokenized_bleu=False, eval_bleu_remove_bpe=None, eval_bleu_print_samples=False, label_smoothing=0.1, report_accuracy=False, ignore_prefix_size=0, adam_betas='(0.9, 0.98)', adam_eps=1e-08, weight_decay=0.0, use_old_adam=False, fp16_adam_stats=False, warmup_updates=4000, warmup_init_lr=-1, pad=1, eos=2, unk=3, encoder_ffn_embed_dim=8192, decoder_ffn_embed_dim=8192, dropout=0.2, attention_dropout=0.1, share_decoder_input_output_embed=True, no_seed_provided=False, encoder_embed_dim=1024, encoder_attention_heads=16, encoder_normalize_before=False, decoder_embed_dim=1024, decoder_attention_heads=16, encoder_embed_path=None, encoder_layers=6, encoder_learned_pos=False, decoder_embed_path=None, decoder_layers=6, decoder_normalize_before=False, decoder_learned_pos=False, activation_dropout=0.0, activation_fn='relu', adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, share_all_embeddings=False, merge_src_tgt_embed=False, no_token_positional_embeddings=False, adaptive_input=False, no_cross_attention=False, cross_self_attention=False, decoder_output_dim=1024, decoder_input_dim=1024, no_scale_embedding=False, layernorm_embedding=False, tie_adaptive_weights=False, checkpoint_activations=False, offload_activations=False, encoder_layers_to_keep=None, decoder_layers_to_keep=None, encoder_layerdrop=0, decoder_layerdrop=0, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, _name='transformer_vaswani_wmt_en_de_big'), 'task': {'_name': 'translation', 'data': 'binarized/en-ja/', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.001]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.001]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} TransformerModel( (encoder): TransformerEncoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(16000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerEncoderLayerBase( (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (dropout_module): FairseqDropout() (activation_dropout_module): FairseqDropout() (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) ) (decoder): TransformerDecoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(32000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerDecoderLayerBase( (dropout_module): FairseqDropout() (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (activation_dropout_module): FairseqDropout() (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (encoder_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (encoder_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) (output_projection): Linear(in_features=1024, out_features=32000, bias=False) ) ) task: TranslationTask model: TransformerModel criterion: LabelSmoothedCrossEntropyCriterion num. shared model params: 326,221,824 (num. trained: 326,221,824) num. expert model params: 0 (num. trained: 0) training on 8 devices (GPUs/TPUs) max tokens per device = 16384 and max sentences per device = None begin dry-run validation on "valid" subset Start iterating over samples epoch 001: 101 / 1689 loss=13.088, nll_loss=12.777, ppl=7019.14, wps=458088, ups=1.05, wpb=435063, bsz=16693.8, num_updates=100, lr=2.5e-05, gnorm=3.095, clip=91, loss_scale=4, train_wall=99, gb_free=18.6, wall=118 epoch 001: 201 / 1689 loss=11.675, nll_loss=11.174, ppl=2310.43, wps=460500, ups=1.06, wpb=435232, bsz=16930.2, num_updates=200, lr=5e-05, gnorm=1.876, clip=90, loss_scale=4, train_wall=94, gb_free=19.4, wall=212 epoch 001: 301 / 1689 loss=11.166, nll_loss=10.569, ppl=1519.56, wps=458648, ups=1.05, wpb=434938, bsz=16709.1, num_updates=300, lr=7.5e-05, gnorm=1.807, clip=96, loss_scale=4, train_wall=94, gb_free=19.9, wall=307 epoch 001: 401 / 1689 loss=10.516, nll_loss=9.797, ppl=889.55, wps=458519, ups=1.06, wpb=432580, bsz=16463.2, num_updates=400, lr=0.0001, gnorm=1.578, clip=96, loss_scale=4, train_wall=94, gb_free=19.5, wall=401 epoch 001: 501 / 1689 loss=9.952, nll_loss=9.122, ppl=557.33, wps=457131, ups=1.05, wpb=433348, bsz=16501.3, num_updates=500, lr=0.000125, gnorm=1.5, clip=99, loss_scale=4, train_wall=94, gb_free=18.6, wall=496 epoch 001: 601 / 1689 loss=9.526, nll_loss=8.614, ppl=391.93, wps=454333, ups=1.05, wpb=433129, bsz=16487, num_updates=600, lr=0.00015, gnorm=1.398, clip=95, loss_scale=8, train_wall=94, gb_free=18.5, wall=591 epoch 001: 701 / 1689 loss=9.149, nll_loss=8.169, ppl=287.83, wps=456378, ups=1.05, wpb=434275, bsz=16441.5, num_updates=700, lr=0.000175, gnorm=1.271, clip=92, loss_scale=8, train_wall=94, gb_free=18.9, wall=687 epoch 001: 801 / 1689 loss=8.78, nll_loss=7.738, ppl=213.49, wps=457948, ups=1.05, wpb=435021, bsz=16338.7, num_updates=800, lr=0.0002, gnorm=1.142, clip=75, loss_scale=8, train_wall=94, gb_free=18.4, wall=782 epoch 001: 902 / 1689 loss=8.417, nll_loss=7.316, ppl=159.35, wps=450460, ups=1.04, wpb=435018, bsz=16516.4, num_updates=900, lr=0.000225, gnorm=1.043, clip=51, loss_scale=4, train_wall=95, gb_free=18.6, wall=878 epoch 001: 1002 / 1689 loss=8.085, nll_loss=6.93, ppl=121.95, wps=453265, ups=1.05, wpb=431938, bsz=16755.5, num_updates=1000, lr=0.00025, gnorm=1.003, clip=46, loss_scale=4, train_wall=94, gb_free=19.6, wall=973 begin validation on "valid" subset epoch 001 | valid on 'valid' subset | loss 7.767 | nll_loss 6.504 | ppl 90.76 | wps 0 | wpb 42662 | bsz 2032 | num_updates 1000 epoch 001: 1102 / 1689 loss=7.743, nll_loss=6.535, ppl=92.72, wps=382162, ups=0.89, wpb=430204, bsz=16536.8, num_updates=1100, lr=0.000275, gnorm=0.971, clip=37, loss_scale=4, train_wall=94, gb_free=18.4, wall=1086 epoch 001: 1202 / 1689 loss=7.429, nll_loss=6.171, ppl=72.08, wps=453662, ups=1.05, wpb=433198, bsz=16274.6, num_updates=1200, lr=0.0003, gnorm=0.932, clip=32, loss_scale=4, train_wall=94, gb_free=20.4, wall=1181 epoch 001: 1302 / 1689 loss=7.089, nll_loss=5.78, ppl=54.95, wps=456089, ups=1.05, wpb=432602, bsz=16372.2, num_updates=1300, lr=0.000325, gnorm=0.907, clip=29, loss_scale=4, train_wall=93, gb_free=20.6, wall=1276 epoch 001: 1402 / 1689 loss=6.788, nll_loss=5.434, ppl=43.24, wps=456215, ups=1.05, wpb=434746, bsz=16342.9, num_updates=1400, lr=0.00035, gnorm=0.845, clip=15, loss_scale=8, train_wall=94, gb_free=18.9, wall=1372 epoch 001: 1502 / 1689 loss=6.511, nll_loss=5.119, ppl=34.75, wps=454742, ups=1.04, wpb=435243, bsz=16512.3, num_updates=1500, lr=0.000375, gnorm=0.785, clip=10, loss_scale=8, train_wall=93, gb_free=18.3, wall=1467 epoch 001: 1602 / 1689 loss=6.293, nll_loss=4.872, ppl=29.28, wps=455451, ups=1.05, wpb=433941, bsz=16442.2, num_updates=1600, lr=0.0004, gnorm=0.709, clip=5, loss_scale=8, train_wall=94, gb_free=19.1, wall=1563 end of epoch 1 (average epoch stats below) epoch 001 | loss 8.749 | nll_loss 7.721 | ppl 211.02 | wps 450763 | ups 1.04 | wpb 433524 | bsz 16505.3 | num_updates 1687 | lr 0.00042175 | gnorm 1.272 | clip 57.2 | loss_scale 8 | train_wall 1589 | gb_free 21 | wall 1645 Start iterating over samples epoch 002: 13 / 1689 loss=6.128, nll_loss=4.687, ppl=25.76, wps=448471, ups=1.04, wpb=429197, bsz=16313.5, num_updates=1700, lr=0.000425, gnorm=0.704, clip=10, loss_scale=8, train_wall=94, gb_free=19.3, wall=1658 epoch 002: 13 / 1689 loss=6.128, nll_loss=4.687, ppl=25.76, wps=448471, ups=1.04, wpb=429197, bsz=16313.5, num_updates=1700, lr=0.000425, gnorm=0.704, clip=10, loss_scale=8, train_wall=94, gb_free=19.3, wall=1658 epoch 002: 113 / 1689 loss=5.96, nll_loss=4.499, ppl=22.61, wps=455212, ups=1.05, wpb=433627, bsz=16774.5, num_updates=1800, lr=0.00045, gnorm=0.609, clip=4, loss_scale=8, train_wall=93, gb_free=19, wall=1754 epoch 002: 113 / 1689 loss=5.96, nll_loss=4.499, ppl=22.61, wps=455212, ups=1.05, wpb=433627, bsz=16774.5, num_updates=1800, lr=0.00045, gnorm=0.609, clip=4, loss_scale=8, train_wall=93, gb_free=19, wall=1754 epoch 002: 214 / 1689 loss=5.843, nll_loss=4.369, ppl=20.67, wps=444850, ups=1.02, wpb=434436, bsz=16456.8, num_updates=1900, lr=0.000475, gnorm=0.593, clip=2, loss_scale=8, train_wall=96, gb_free=19.4, wall=1851 epoch 002: 214 / 1689 loss=5.843, nll_loss=4.369, ppl=20.67, wps=444850, ups=1.02, wpb=434436, bsz=16456.8, num_updates=1900, lr=0.000475, gnorm=0.593, clip=2, loss_scale=8, train_wall=96, gb_free=19.4, wall=1851 epoch 002: 314 / 1689 loss=5.712, nll_loss=4.224, ppl=18.69, wps=455456, ups=1.05, wpb=434872, bsz=16790, num_updates=2000, lr=0.0005, gnorm=0.558, clip=0, loss_scale=8, train_wall=94, gb_free=18.7, wall=1947 epoch 002: 314 / 1689 loss=5.712, nll_loss=4.224, ppl=18.69, wps=455456, ups=1.05, wpb=434872, bsz=16790, num_updates=2000, lr=0.0005, gnorm=0.558, clip=0, loss_scale=8, train_wall=94, gb_free=18.7, wall=1947 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 5.535 | nll_loss 3.943 | ppl 15.38 | wps 0 | wpb 42662 | bsz 2032 | num_updates 2000 | best_loss 5.535 epoch 002 | valid on 'valid' subset | loss 5.535 | nll_loss 3.943 | ppl 15.38 | wps 0 | wpb 42662 | bsz 2032 | num_updates 2000 | best_loss 5.535 epoch 002: 414 / 1689 loss=5.634, nll_loss=4.138, ppl=17.61, wps=375662, ups=0.87, wpb=432589, bsz=16224.5, num_updates=2100, lr=0.000525, gnorm=0.549, clip=4, loss_scale=8, train_wall=93, gb_free=20.3, wall=2062 epoch 002: 414 / 1689 loss=5.634, nll_loss=4.138, ppl=17.61, wps=375662, ups=0.87, wpb=432589, bsz=16224.5, num_updates=2100, lr=0.000525, gnorm=0.549, clip=4, loss_scale=8, train_wall=93, gb_free=20.3, wall=2062 epoch 002: 514 / 1689 loss=5.549, nll_loss=4.045, ppl=16.51, wps=455211, ups=1.05, wpb=435112, bsz=16497.8, num_updates=2200, lr=0.00055, gnorm=0.533, clip=4, loss_scale=8, train_wall=94, gb_free=19.3, wall=2157 epoch 002: 514 / 1689 loss=5.549, nll_loss=4.045, ppl=16.51, wps=455211, ups=1.05, wpb=435112, bsz=16497.8, num_updates=2200, lr=0.00055, gnorm=0.533, clip=4, loss_scale=8, train_wall=94, gb_free=19.3, wall=2157 epoch 002: 615 / 1689 loss=5.44, nll_loss=3.926, ppl=15.2, wps=449726, ups=1.04, wpb=432819, bsz=16665.7, num_updates=2300, lr=0.000575, gnorm=0.474, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=2254 epoch 002: 615 / 1689 loss=5.44, nll_loss=3.926, ppl=15.2, wps=449726, ups=1.04, wpb=432819, bsz=16665.7, num_updates=2300, lr=0.000575, gnorm=0.474, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=2254 epoch 002: 715 / 1689 loss=5.376, nll_loss=3.856, ppl=14.48, wps=454249, ups=1.05, wpb=434076, bsz=16179.8, num_updates=2400, lr=0.0006, gnorm=0.502, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=2349 epoch 002: 715 / 1689 loss=5.376, nll_loss=3.856, ppl=14.48, wps=454249, ups=1.05, wpb=434076, bsz=16179.8, num_updates=2400, lr=0.0006, gnorm=0.502, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=2349 epoch 002: 815 / 1689 loss=5.304, nll_loss=3.778, ppl=13.72, wps=454249, ups=1.05, wpb=431461, bsz=16785, num_updates=2500, lr=0.000625, gnorm=0.439, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=2444 epoch 002: 815 / 1689 loss=5.304, nll_loss=3.778, ppl=13.72, wps=454249, ups=1.05, wpb=431461, bsz=16785, num_updates=2500, lr=0.000625, gnorm=0.439, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=2444 epoch 002: 915 / 1689 loss=5.26, nll_loss=3.73, ppl=13.27, wps=459990, ups=1.06, wpb=434198, bsz=16537.8, num_updates=2600, lr=0.00065, gnorm=0.462, clip=0, loss_scale=4, train_wall=93, gb_free=21.1, wall=2539 epoch 002: 915 / 1689 loss=5.26, nll_loss=3.73, ppl=13.27, wps=459990, ups=1.06, wpb=434198, bsz=16537.8, num_updates=2600, lr=0.00065, gnorm=0.462, clip=0, loss_scale=4, train_wall=93, gb_free=21.1, wall=2539 epoch 002: 1015 / 1689 loss=5.198, nll_loss=3.664, ppl=12.67, wps=457447, ups=1.06, wpb=432976, bsz=16357.6, num_updates=2700, lr=0.000675, gnorm=0.479, clip=2, loss_scale=4, train_wall=93, gb_free=19.9, wall=2633 epoch 002: 1015 / 1689 loss=5.198, nll_loss=3.664, ppl=12.67, wps=457447, ups=1.06, wpb=432976, bsz=16357.6, num_updates=2700, lr=0.000675, gnorm=0.479, clip=2, loss_scale=4, train_wall=93, gb_free=19.9, wall=2633 epoch 002: 1115 / 1689 loss=5.158, nll_loss=3.62, ppl=12.3, wps=454655, ups=1.05, wpb=434531, bsz=16415.1, num_updates=2800, lr=0.0007, gnorm=0.403, clip=0, loss_scale=8, train_wall=93, gb_free=18, wall=2729 epoch 002: 1115 / 1689 loss=5.158, nll_loss=3.62, ppl=12.3, wps=454655, ups=1.05, wpb=434531, bsz=16415.1, num_updates=2800, lr=0.0007, gnorm=0.403, clip=0, loss_scale=8, train_wall=93, gb_free=18, wall=2729 epoch 002: 1216 / 1689 loss=5.116, nll_loss=3.574, ppl=11.91, wps=451864, ups=1.04, wpb=434440, bsz=16574.9, num_updates=2900, lr=0.000725, gnorm=0.453, clip=0, loss_scale=4, train_wall=95, gb_free=18.6, wall=2825 epoch 002: 1216 / 1689 loss=5.116, nll_loss=3.574, ppl=11.91, wps=451864, ups=1.04, wpb=434440, bsz=16574.9, num_updates=2900, lr=0.000725, gnorm=0.453, clip=0, loss_scale=4, train_wall=95, gb_free=18.6, wall=2825 epoch 002: 1316 / 1689 loss=5.072, nll_loss=3.527, ppl=11.53, wps=450536, ups=1.04, wpb=432917, bsz=16800.8, num_updates=3000, lr=0.00075, gnorm=0.422, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=2921 epoch 002: 1316 / 1689 loss=5.072, nll_loss=3.527, ppl=11.53, wps=450536, ups=1.04, wpb=432917, bsz=16800.8, num_updates=3000, lr=0.00075, gnorm=0.422, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=2921 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 4.943 | nll_loss 3.315 | ppl 9.95 | wps 0 | wpb 42662 | bsz 2032 | num_updates 3000 | best_loss 4.943 epoch 002 | valid on 'valid' subset | loss 4.943 | nll_loss 3.315 | ppl 9.95 | wps 0 | wpb 42662 | bsz 2032 | num_updates 3000 | best_loss 4.943 epoch 002: 1416 / 1689 loss=5.036, nll_loss=3.488, ppl=11.22, wps=384654, ups=0.89, wpb=433486, bsz=16253.3, num_updates=3100, lr=0.000775, gnorm=0.432, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=3034 epoch 002: 1416 / 1689 loss=5.036, nll_loss=3.488, ppl=11.22, wps=384654, ups=0.89, wpb=433486, bsz=16253.3, num_updates=3100, lr=0.000775, gnorm=0.432, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=3034 epoch 002: 1516 / 1689 loss=5.019, nll_loss=3.471, ppl=11.09, wps=458900, ups=1.06, wpb=433644, bsz=16473, num_updates=3200, lr=0.0008, gnorm=0.417, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=3128 epoch 002: 1516 / 1689 loss=5.019, nll_loss=3.471, ppl=11.09, wps=458900, ups=1.06, wpb=433644, bsz=16473, num_updates=3200, lr=0.0008, gnorm=0.417, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=3128 epoch 002: 1616 / 1689 loss=4.969, nll_loss=3.416, ppl=10.67, wps=456912, ups=1.05, wpb=433900, bsz=16393.8, num_updates=3300, lr=0.000825, gnorm=0.41, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=3223 epoch 002: 1616 / 1689 loss=4.969, nll_loss=3.416, ppl=10.67, wps=456912, ups=1.05, wpb=433900, bsz=16393.8, num_updates=3300, lr=0.000825, gnorm=0.41, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=3223 end of epoch 2 (average epoch stats below) epoch 002 | loss 5.342 | nll_loss 3.822 | ppl 14.14 | wps 443708 | ups 1.02 | wpb 433540 | bsz 16501.5 | num_updates 3372 | lr 0.000843 | gnorm 0.483 | clip 1.2 | loss_scale 4 | train_wall 1578 | gb_free 20.3 | wall 3291 epoch 002 | loss 5.342 | nll_loss 3.822 | ppl 14.14 | wps 443708 | ups 1.02 | wpb 433540 | bsz 16501.5 | num_updates 3372 | lr 0.000843 | gnorm 0.483 | clip 1.2 | loss_scale 4 | train_wall 1578 | gb_free 20.3 | wall 3291 Start iterating over samples epoch 003: 28 / 1689 loss=4.969, nll_loss=3.417, ppl=10.68, wps=452345, ups=1.05, wpb=430900, bsz=16328.9, num_updates=3400, lr=0.00085, gnorm=0.442, clip=1, loss_scale=4, train_wall=93, gb_free=17.6, wall=3319 epoch 003: 28 / 1689 loss=4.969, nll_loss=3.417, ppl=10.68, wps=452345, ups=1.05, wpb=430900, bsz=16328.9, num_updates=3400, lr=0.00085, gnorm=0.442, clip=1, loss_scale=4, train_wall=93, gb_free=17.6, wall=3319 epoch 003: 28 / 1689 loss=4.969, nll_loss=3.417, ppl=10.68, wps=452345, ups=1.05, wpb=430900, bsz=16328.9, num_updates=3400, lr=0.00085, gnorm=0.442, clip=1, loss_scale=4, train_wall=93, gb_free=17.6, wall=3319 epoch 003: 128 / 1689 loss=4.912, nll_loss=3.354, ppl=10.22, wps=459826, ups=1.06, wpb=434010, bsz=16790.2, num_updates=3500, lr=0.000875, gnorm=0.393, clip=0, loss_scale=4, train_wall=93, gb_free=17.6, wall=3413 epoch 003: 128 / 1689 loss=4.912, nll_loss=3.354, ppl=10.22, wps=459826, ups=1.06, wpb=434010, bsz=16790.2, num_updates=3500, lr=0.000875, gnorm=0.393, clip=0, loss_scale=4, train_wall=93, gb_free=17.6, wall=3413 epoch 003: 128 / 1689 loss=4.912, nll_loss=3.354, ppl=10.22, wps=459826, ups=1.06, wpb=434010, bsz=16790.2, num_updates=3500, lr=0.000875, gnorm=0.393, clip=0, loss_scale=4, train_wall=93, gb_free=17.6, wall=3413 epoch 003: 228 / 1689 loss=4.908, nll_loss=3.35, ppl=10.2, wps=457529, ups=1.05, wpb=434130, bsz=16909.3, num_updates=3600, lr=0.0009, gnorm=0.425, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=3508 epoch 003: 228 / 1689 loss=4.908, nll_loss=3.35, ppl=10.2, wps=457529, ups=1.05, wpb=434130, bsz=16909.3, num_updates=3600, lr=0.0009, gnorm=0.425, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=3508 epoch 003: 228 / 1689 loss=4.908, nll_loss=3.35, ppl=10.2, wps=457529, ups=1.05, wpb=434130, bsz=16909.3, num_updates=3600, lr=0.0009, gnorm=0.425, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=3508 epoch 003: 328 / 1689 loss=4.88, nll_loss=3.32, ppl=9.99, wps=455499, ups=1.06, wpb=431723, bsz=16540.2, num_updates=3700, lr=0.000925, gnorm=0.418, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=3603 epoch 003: 328 / 1689 loss=4.88, nll_loss=3.32, ppl=9.99, wps=455499, ups=1.06, wpb=431723, bsz=16540.2, num_updates=3700, lr=0.000925, gnorm=0.418, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=3603 epoch 003: 328 / 1689 loss=4.88, nll_loss=3.32, ppl=9.99, wps=455499, ups=1.06, wpb=431723, bsz=16540.2, num_updates=3700, lr=0.000925, gnorm=0.418, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=3603 epoch 003: 428 / 1689 loss=4.874, nll_loss=3.315, ppl=9.95, wps=455504, ups=1.05, wpb=433106, bsz=16454.4, num_updates=3800, lr=0.00095, gnorm=0.403, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=3698 epoch 003: 428 / 1689 loss=4.874, nll_loss=3.315, ppl=9.95, wps=455504, ups=1.05, wpb=433106, bsz=16454.4, num_updates=3800, lr=0.00095, gnorm=0.403, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=3698 epoch 003: 428 / 1689 loss=4.874, nll_loss=3.315, ppl=9.95, wps=455504, ups=1.05, wpb=433106, bsz=16454.4, num_updates=3800, lr=0.00095, gnorm=0.403, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=3698 epoch 003: 529 / 1689 loss=4.868, nll_loss=3.309, ppl=9.91, wps=448880, ups=1.03, wpb=434869, bsz=16527.2, num_updates=3900, lr=0.000975, gnorm=0.419, clip=0, loss_scale=2, train_wall=95, gb_free=18.6, wall=3795 epoch 003: 529 / 1689 loss=4.868, nll_loss=3.309, ppl=9.91, wps=448880, ups=1.03, wpb=434869, bsz=16527.2, num_updates=3900, lr=0.000975, gnorm=0.419, clip=0, loss_scale=2, train_wall=95, gb_free=18.6, wall=3795 epoch 003: 529 / 1689 loss=4.868, nll_loss=3.309, ppl=9.91, wps=448880, ups=1.03, wpb=434869, bsz=16527.2, num_updates=3900, lr=0.000975, gnorm=0.419, clip=0, loss_scale=2, train_wall=95, gb_free=18.6, wall=3795 epoch 003: 629 / 1689 loss=4.847, nll_loss=3.286, ppl=9.75, wps=456515, ups=1.05, wpb=434096, bsz=16447.2, num_updates=4000, lr=0.001, gnorm=0.407, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=3890 epoch 003: 629 / 1689 loss=4.847, nll_loss=3.286, ppl=9.75, wps=456515, ups=1.05, wpb=434096, bsz=16447.2, num_updates=4000, lr=0.001, gnorm=0.407, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=3890 epoch 003: 629 / 1689 loss=4.847, nll_loss=3.286, ppl=9.75, wps=456515, ups=1.05, wpb=434096, bsz=16447.2, num_updates=4000, lr=0.001, gnorm=0.407, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=3890 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.733 | nll_loss 3.114 | ppl 8.66 | wps 0 | wpb 42662 | bsz 2032 | num_updates 4000 | best_loss 4.733 epoch 003 | valid on 'valid' subset | loss 4.733 | nll_loss 3.114 | ppl 8.66 | wps 0 | wpb 42662 | bsz 2032 | num_updates 4000 | best_loss 4.733 epoch 003 | valid on 'valid' subset | loss 4.733 | nll_loss 3.114 | ppl 8.66 | wps 0 | wpb 42662 | bsz 2032 | num_updates 4000 | best_loss 4.733 epoch 003: 729 / 1689 loss=4.847, nll_loss=3.286, ppl=9.76, wps=376370, ups=0.87, wpb=434512, bsz=16404.5, num_updates=4100, lr=0.00098773, gnorm=0.417, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=4005 epoch 003: 729 / 1689 loss=4.847, nll_loss=3.286, ppl=9.76, wps=376370, ups=0.87, wpb=434512, bsz=16404.5, num_updates=4100, lr=0.00098773, gnorm=0.417, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=4005 epoch 003: 729 / 1689 loss=4.847, nll_loss=3.286, ppl=9.76, wps=376370, ups=0.87, wpb=434512, bsz=16404.5, num_updates=4100, lr=0.00098773, gnorm=0.417, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=4005 epoch 003: 829 / 1689 loss=4.816, nll_loss=3.253, ppl=9.53, wps=457080, ups=1.05, wpb=433330, bsz=16200.6, num_updates=4200, lr=0.0009759, gnorm=0.387, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=4100 epoch 003: 829 / 1689 loss=4.816, nll_loss=3.253, ppl=9.53, wps=457080, ups=1.05, wpb=433330, bsz=16200.6, num_updates=4200, lr=0.0009759, gnorm=0.387, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=4100 epoch 003: 829 / 1689 loss=4.816, nll_loss=3.253, ppl=9.53, wps=457080, ups=1.05, wpb=433330, bsz=16200.6, num_updates=4200, lr=0.0009759, gnorm=0.387, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=4100 epoch 003: 929 / 1689 loss=4.797, nll_loss=3.233, ppl=9.4, wps=457320, ups=1.06, wpb=432709, bsz=16375.2, num_updates=4300, lr=0.000964486, gnorm=0.4, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=4195 epoch 003: 929 / 1689 loss=4.797, nll_loss=3.233, ppl=9.4, wps=457320, ups=1.06, wpb=432709, bsz=16375.2, num_updates=4300, lr=0.000964486, gnorm=0.4, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=4195 epoch 003: 929 / 1689 loss=4.797, nll_loss=3.233, ppl=9.4, wps=457320, ups=1.06, wpb=432709, bsz=16375.2, num_updates=4300, lr=0.000964486, gnorm=0.4, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=4195 epoch 003: 1029 / 1689 loss=4.794, nll_loss=3.23, ppl=9.38, wps=456865, ups=1.05, wpb=434450, bsz=16334.5, num_updates=4400, lr=0.000953463, gnorm=0.371, clip=0, loss_scale=4, train_wall=93, gb_free=17.9, wall=4290 epoch 003: 1029 / 1689 loss=4.794, nll_loss=3.23, ppl=9.38, wps=456865, ups=1.05, wpb=434450, bsz=16334.5, num_updates=4400, lr=0.000953463, gnorm=0.371, clip=0, loss_scale=4, train_wall=93, gb_free=17.9, wall=4290 epoch 003: 1029 / 1689 loss=4.794, nll_loss=3.23, ppl=9.38, wps=456865, ups=1.05, wpb=434450, bsz=16334.5, num_updates=4400, lr=0.000953463, gnorm=0.371, clip=0, loss_scale=4, train_wall=93, gb_free=17.9, wall=4290 epoch 003: 1129 / 1689 loss=4.779, nll_loss=3.214, ppl=9.28, wps=457874, ups=1.05, wpb=437337, bsz=16945.8, num_updates=4500, lr=0.000942809, gnorm=0.382, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=4385 epoch 003: 1129 / 1689 loss=4.779, nll_loss=3.214, ppl=9.28, wps=457874, ups=1.05, wpb=437337, bsz=16945.8, num_updates=4500, lr=0.000942809, gnorm=0.382, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=4385 epoch 003: 1129 / 1689 loss=4.779, nll_loss=3.214, ppl=9.28, wps=457874, ups=1.05, wpb=437337, bsz=16945.8, num_updates=4500, lr=0.000942809, gnorm=0.382, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=4385 epoch 003: 1230 / 1689 loss=4.761, nll_loss=3.195, ppl=9.16, wps=454706, ups=1.05, wpb=433765, bsz=16347.4, num_updates=4600, lr=0.000932505, gnorm=0.374, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=4481 epoch 003: 1230 / 1689 loss=4.761, nll_loss=3.195, ppl=9.16, wps=454706, ups=1.05, wpb=433765, bsz=16347.4, num_updates=4600, lr=0.000932505, gnorm=0.374, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=4481 epoch 003: 1230 / 1689 loss=4.761, nll_loss=3.195, ppl=9.16, wps=454706, ups=1.05, wpb=433765, bsz=16347.4, num_updates=4600, lr=0.000932505, gnorm=0.374, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=4481 epoch 003: 1330 / 1689 loss=4.736, nll_loss=3.167, ppl=8.98, wps=453606, ups=1.05, wpb=433946, bsz=16368.4, num_updates=4700, lr=0.000922531, gnorm=0.364, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=4576 epoch 003: 1330 / 1689 loss=4.736, nll_loss=3.167, ppl=8.98, wps=453606, ups=1.05, wpb=433946, bsz=16368.4, num_updates=4700, lr=0.000922531, gnorm=0.364, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=4576 epoch 003: 1330 / 1689 loss=4.736, nll_loss=3.167, ppl=8.98, wps=453606, ups=1.05, wpb=433946, bsz=16368.4, num_updates=4700, lr=0.000922531, gnorm=0.364, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=4576 epoch 003: 1430 / 1689 loss=4.738, nll_loss=3.17, ppl=9, wps=458325, ups=1.06, wpb=433651, bsz=16602.7, num_updates=4800, lr=0.000912871, gnorm=0.369, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=4671 epoch 003: 1430 / 1689 loss=4.738, nll_loss=3.17, ppl=9, wps=458325, ups=1.06, wpb=433651, bsz=16602.7, num_updates=4800, lr=0.000912871, gnorm=0.369, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=4671 epoch 003: 1430 / 1689 loss=4.738, nll_loss=3.17, ppl=9, wps=458325, ups=1.06, wpb=433651, bsz=16602.7, num_updates=4800, lr=0.000912871, gnorm=0.369, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=4671 epoch 003: 1530 / 1689 loss=4.711, nll_loss=3.14, ppl=8.82, wps=452259, ups=1.05, wpb=432092, bsz=16523.6, num_updates=4900, lr=0.000903508, gnorm=0.36, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=4766 epoch 003: 1530 / 1689 loss=4.711, nll_loss=3.14, ppl=8.82, wps=452259, ups=1.05, wpb=432092, bsz=16523.6, num_updates=4900, lr=0.000903508, gnorm=0.36, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=4766 epoch 003: 1530 / 1689 loss=4.711, nll_loss=3.14, ppl=8.82, wps=452259, ups=1.05, wpb=432092, bsz=16523.6, num_updates=4900, lr=0.000903508, gnorm=0.36, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=4766 epoch 003: 1630 / 1689 loss=4.703, nll_loss=3.132, ppl=8.76, wps=455164, ups=1.05, wpb=431600, bsz=16164.2, num_updates=5000, lr=0.000894427, gnorm=0.359, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=4861 epoch 003: 1630 / 1689 loss=4.703, nll_loss=3.132, ppl=8.76, wps=455164, ups=1.05, wpb=431600, bsz=16164.2, num_updates=5000, lr=0.000894427, gnorm=0.359, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=4861 epoch 003: 1630 / 1689 loss=4.703, nll_loss=3.132, ppl=8.76, wps=455164, ups=1.05, wpb=431600, bsz=16164.2, num_updates=5000, lr=0.000894427, gnorm=0.359, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=4861 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.625 | nll_loss 2.996 | ppl 7.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 5000 | best_loss 4.625 epoch 003 | valid on 'valid' subset | loss 4.625 | nll_loss 2.996 | ppl 7.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 5000 | best_loss 4.625 epoch 003 | valid on 'valid' subset | loss 4.625 | nll_loss 2.996 | ppl 7.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 5000 | best_loss 4.625 end of epoch 3 (average epoch stats below) epoch 003 | loss 4.809 | nll_loss 3.245 | ppl 9.48 | wps 444486 | ups 1.03 | wpb 433510 | bsz 16499.4 | num_updates 5059 | lr 0.000889196 | gnorm 0.391 | clip 0.1 | loss_scale 2 | train_wall 1575 | gb_free 19.4 | wall 4937 epoch 003 | loss 4.809 | nll_loss 3.245 | ppl 9.48 | wps 444486 | ups 1.03 | wpb 433510 | bsz 16499.4 | num_updates 5059 | lr 0.000889196 | gnorm 0.391 | clip 0.1 | loss_scale 2 | train_wall 1575 | gb_free 19.4 | wall 4937 epoch 003 | loss 4.809 | nll_loss 3.245 | ppl 9.48 | wps 444486 | ups 1.03 | wpb 433510 | bsz 16499.4 | num_updates 5059 | lr 0.000889196 | gnorm 0.391 | clip 0.1 | loss_scale 2 | train_wall 1575 | gb_free 19.4 | wall 4937 Start iterating over samples epoch 004: 41 / 1689 loss=4.676, nll_loss=3.102, ppl=8.59, wps=370502, ups=0.86, wpb=430678, bsz=16840.2, num_updates=5100, lr=0.000885615, gnorm=0.354, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=4977 epoch 004: 41 / 1689 loss=4.676, nll_loss=3.102, ppl=8.59, wps=370502, ups=0.86, wpb=430678, bsz=16840.2, num_updates=5100, lr=0.000885615, gnorm=0.354, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=4977 epoch 004: 41 / 1689 loss=4.676, nll_loss=3.102, ppl=8.59, wps=370502, ups=0.86, wpb=430678, bsz=16840.2, num_updates=5100, lr=0.000885615, gnorm=0.354, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=4977 epoch 004: 41 / 1689 loss=4.676, nll_loss=3.102, ppl=8.59, wps=370502, ups=0.86, wpb=430678, bsz=16840.2, num_updates=5100, lr=0.000885615, gnorm=0.354, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=4977 epoch 004: 141 / 1689 loss=4.656, nll_loss=3.079, ppl=8.45, wps=458414, ups=1.06, wpb=433564, bsz=16394.5, num_updates=5200, lr=0.000877058, gnorm=0.344, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=5072 epoch 004: 141 / 1689 loss=4.656, nll_loss=3.079, ppl=8.45, wps=458414, ups=1.06, wpb=433564, bsz=16394.5, num_updates=5200, lr=0.000877058, gnorm=0.344, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=5072 epoch 004: 141 / 1689 loss=4.656, nll_loss=3.079, ppl=8.45, wps=458414, ups=1.06, wpb=433564, bsz=16394.5, num_updates=5200, lr=0.000877058, gnorm=0.344, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=5072 epoch 004: 141 / 1689 loss=4.656, nll_loss=3.079, ppl=8.45, wps=458414, ups=1.06, wpb=433564, bsz=16394.5, num_updates=5200, lr=0.000877058, gnorm=0.344, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=5072 epoch 004: 241 / 1689 loss=4.648, nll_loss=3.07, ppl=8.4, wps=455424, ups=1.05, wpb=431917, bsz=16263.4, num_updates=5300, lr=0.000868744, gnorm=0.355, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=5167 epoch 004: 241 / 1689 loss=4.648, nll_loss=3.07, ppl=8.4, wps=455424, ups=1.05, wpb=431917, bsz=16263.4, num_updates=5300, lr=0.000868744, gnorm=0.355, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=5167 epoch 004: 241 / 1689 loss=4.648, nll_loss=3.07, ppl=8.4, wps=455424, ups=1.05, wpb=431917, bsz=16263.4, num_updates=5300, lr=0.000868744, gnorm=0.355, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=5167 epoch 004: 241 / 1689 loss=4.648, nll_loss=3.07, ppl=8.4, wps=455424, ups=1.05, wpb=431917, bsz=16263.4, num_updates=5300, lr=0.000868744, gnorm=0.355, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=5167 epoch 004: 342 / 1689 loss=4.65, nll_loss=3.073, ppl=8.42, wps=449204, ups=1.03, wpb=434629, bsz=16700, num_updates=5400, lr=0.000860663, gnorm=0.343, clip=0, loss_scale=2, train_wall=95, gb_free=18.3, wall=5264 epoch 004: 342 / 1689 loss=4.65, nll_loss=3.073, ppl=8.42, wps=449204, ups=1.03, wpb=434629, bsz=16700, num_updates=5400, lr=0.000860663, gnorm=0.343, clip=0, loss_scale=2, train_wall=95, gb_free=18.3, wall=5264 epoch 004: 342 / 1689 loss=4.65, nll_loss=3.073, ppl=8.42, wps=449204, ups=1.03, wpb=434629, bsz=16700, num_updates=5400, lr=0.000860663, gnorm=0.343, clip=0, loss_scale=2, train_wall=95, gb_free=18.3, wall=5264 epoch 004: 342 / 1689 loss=4.65, nll_loss=3.073, ppl=8.42, wps=449204, ups=1.03, wpb=434629, bsz=16700, num_updates=5400, lr=0.000860663, gnorm=0.343, clip=0, loss_scale=2, train_wall=95, gb_free=18.3, wall=5264 epoch 004: 442 / 1689 loss=4.641, nll_loss=3.063, ppl=8.36, wps=456085, ups=1.06, wpb=432167, bsz=16708.9, num_updates=5500, lr=0.000852803, gnorm=0.339, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=5358 epoch 004: 442 / 1689 loss=4.641, nll_loss=3.063, ppl=8.36, wps=456085, ups=1.06, wpb=432167, bsz=16708.9, num_updates=5500, lr=0.000852803, gnorm=0.339, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=5358 epoch 004: 442 / 1689 loss=4.641, nll_loss=3.063, ppl=8.36, wps=456085, ups=1.06, wpb=432167, bsz=16708.9, num_updates=5500, lr=0.000852803, gnorm=0.339, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=5358 epoch 004: 442 / 1689 loss=4.641, nll_loss=3.063, ppl=8.36, wps=456085, ups=1.06, wpb=432167, bsz=16708.9, num_updates=5500, lr=0.000852803, gnorm=0.339, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=5358 epoch 004: 542 / 1689 loss=4.64, nll_loss=3.063, ppl=8.36, wps=459560, ups=1.06, wpb=433635, bsz=16543.4, num_updates=5600, lr=0.000845154, gnorm=0.345, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=5453 epoch 004: 542 / 1689 loss=4.64, nll_loss=3.063, ppl=8.36, wps=459560, ups=1.06, wpb=433635, bsz=16543.4, num_updates=5600, lr=0.000845154, gnorm=0.345, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=5453 epoch 004: 542 / 1689 loss=4.64, nll_loss=3.063, ppl=8.36, wps=459560, ups=1.06, wpb=433635, bsz=16543.4, num_updates=5600, lr=0.000845154, gnorm=0.345, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=5453 epoch 004: 542 / 1689 loss=4.64, nll_loss=3.063, ppl=8.36, wps=459560, ups=1.06, wpb=433635, bsz=16543.4, num_updates=5600, lr=0.000845154, gnorm=0.345, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=5453 epoch 004: 642 / 1689 loss=4.623, nll_loss=3.045, ppl=8.25, wps=459553, ups=1.06, wpb=434428, bsz=16557.4, num_updates=5700, lr=0.000837708, gnorm=0.329, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=5547 epoch 004: 642 / 1689 loss=4.623, nll_loss=3.045, ppl=8.25, wps=459553, ups=1.06, wpb=434428, bsz=16557.4, num_updates=5700, lr=0.000837708, gnorm=0.329, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=5547 epoch 004: 642 / 1689 loss=4.623, nll_loss=3.045, ppl=8.25, wps=459553, ups=1.06, wpb=434428, bsz=16557.4, num_updates=5700, lr=0.000837708, gnorm=0.329, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=5547 epoch 004: 642 / 1689 loss=4.623, nll_loss=3.045, ppl=8.25, wps=459553, ups=1.06, wpb=434428, bsz=16557.4, num_updates=5700, lr=0.000837708, gnorm=0.329, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=5547 epoch 004: 742 / 1689 loss=4.622, nll_loss=3.043, ppl=8.24, wps=455382, ups=1.05, wpb=433671, bsz=16575, num_updates=5800, lr=0.000830455, gnorm=0.352, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=5643 epoch 004: 742 / 1689 loss=4.622, nll_loss=3.043, ppl=8.24, wps=455382, ups=1.05, wpb=433671, bsz=16575, num_updates=5800, lr=0.000830455, gnorm=0.352, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=5643 epoch 004: 742 / 1689 loss=4.622, nll_loss=3.043, ppl=8.24, wps=455382, ups=1.05, wpb=433671, bsz=16575, num_updates=5800, lr=0.000830455, gnorm=0.352, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=5643 epoch 004: 742 / 1689 loss=4.622, nll_loss=3.043, ppl=8.24, wps=455382, ups=1.05, wpb=433671, bsz=16575, num_updates=5800, lr=0.000830455, gnorm=0.352, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=5643 epoch 004: 842 / 1689 loss=4.609, nll_loss=3.029, ppl=8.16, wps=456910, ups=1.05, wpb=435111, bsz=16315.5, num_updates=5900, lr=0.000823387, gnorm=0.335, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=5738 epoch 004: 842 / 1689 loss=4.609, nll_loss=3.029, ppl=8.16, wps=456910, ups=1.05, wpb=435111, bsz=16315.5, num_updates=5900, lr=0.000823387, gnorm=0.335, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=5738 epoch 004: 842 / 1689 loss=4.609, nll_loss=3.029, ppl=8.16, wps=456910, ups=1.05, wpb=435111, bsz=16315.5, num_updates=5900, lr=0.000823387, gnorm=0.335, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=5738 epoch 004: 842 / 1689 loss=4.609, nll_loss=3.029, ppl=8.16, wps=456910, ups=1.05, wpb=435111, bsz=16315.5, num_updates=5900, lr=0.000823387, gnorm=0.335, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=5738 epoch 004: 943 / 1689 loss=4.601, nll_loss=3.021, ppl=8.12, wps=450429, ups=1.04, wpb=434212, bsz=16379.9, num_updates=6000, lr=0.000816497, gnorm=0.337, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=5834 epoch 004: 943 / 1689 loss=4.601, nll_loss=3.021, ppl=8.12, wps=450429, ups=1.04, wpb=434212, bsz=16379.9, num_updates=6000, lr=0.000816497, gnorm=0.337, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=5834 epoch 004: 943 / 1689 loss=4.601, nll_loss=3.021, ppl=8.12, wps=450429, ups=1.04, wpb=434212, bsz=16379.9, num_updates=6000, lr=0.000816497, gnorm=0.337, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=5834 epoch 004: 943 / 1689 loss=4.601, nll_loss=3.021, ppl=8.12, wps=450429, ups=1.04, wpb=434212, bsz=16379.9, num_updates=6000, lr=0.000816497, gnorm=0.337, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=5834 begin validation on "valid" subset epoch 004 | valid on 'valid' subset | loss 4.546 | nll_loss 2.913 | ppl 7.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.546 epoch 004 | valid on 'valid' subset | loss 4.546 | nll_loss 2.913 | ppl 7.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.546 epoch 004 | valid on 'valid' subset | loss 4.546 | nll_loss 2.913 | ppl 7.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.546 epoch 004 | valid on 'valid' subset | loss 4.546 | nll_loss 2.913 | ppl 7.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.546 epoch 004: 1043 / 1689 loss=4.603, nll_loss=3.024, ppl=8.13, wps=384447, ups=0.89, wpb=433826, bsz=16702.7, num_updates=6100, lr=0.000809776, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=5947 epoch 004: 1043 / 1689 loss=4.603, nll_loss=3.024, ppl=8.13, wps=384447, ups=0.89, wpb=433826, bsz=16702.7, num_updates=6100, lr=0.000809776, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=5947 epoch 004: 1043 / 1689 loss=4.603, nll_loss=3.024, ppl=8.13, wps=384447, ups=0.89, wpb=433826, bsz=16702.7, num_updates=6100, lr=0.000809776, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=5947 epoch 004: 1043 / 1689 loss=4.603, nll_loss=3.024, ppl=8.13, wps=384447, ups=0.89, wpb=433826, bsz=16702.7, num_updates=6100, lr=0.000809776, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=5947 epoch 004: 1143 / 1689 loss=4.592, nll_loss=3.011, ppl=8.06, wps=455548, ups=1.05, wpb=434608, bsz=16657.3, num_updates=6200, lr=0.000803219, gnorm=0.333, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=6042 epoch 004: 1143 / 1689 loss=4.592, nll_loss=3.011, ppl=8.06, wps=455548, ups=1.05, wpb=434608, bsz=16657.3, num_updates=6200, lr=0.000803219, gnorm=0.333, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=6042 epoch 004: 1143 / 1689 loss=4.592, nll_loss=3.011, ppl=8.06, wps=455548, ups=1.05, wpb=434608, bsz=16657.3, num_updates=6200, lr=0.000803219, gnorm=0.333, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=6042 epoch 004: 1143 / 1689 loss=4.592, nll_loss=3.011, ppl=8.06, wps=455548, ups=1.05, wpb=434608, bsz=16657.3, num_updates=6200, lr=0.000803219, gnorm=0.333, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=6042 epoch 004: 1243 / 1689 loss=4.586, nll_loss=3.005, ppl=8.03, wps=457770, ups=1.05, wpb=434700, bsz=16346.2, num_updates=6300, lr=0.000796819, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=20.7, wall=6137 epoch 004: 1243 / 1689 loss=4.586, nll_loss=3.005, ppl=8.03, wps=457770, ups=1.05, wpb=434700, bsz=16346.2, num_updates=6300, lr=0.000796819, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=20.7, wall=6137 epoch 004: 1243 / 1689 loss=4.586, nll_loss=3.005, ppl=8.03, wps=457770, ups=1.05, wpb=434700, bsz=16346.2, num_updates=6300, lr=0.000796819, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=20.7, wall=6137 epoch 004: 1243 / 1689 loss=4.586, nll_loss=3.005, ppl=8.03, wps=457770, ups=1.05, wpb=434700, bsz=16346.2, num_updates=6300, lr=0.000796819, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=20.7, wall=6137 epoch 004: 1343 / 1689 loss=4.581, nll_loss=3, ppl=8, wps=457529, ups=1.06, wpb=432575, bsz=16655.4, num_updates=6400, lr=0.000790569, gnorm=0.329, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=6232 epoch 004: 1343 / 1689 loss=4.581, nll_loss=3, ppl=8, wps=457529, ups=1.06, wpb=432575, bsz=16655.4, num_updates=6400, lr=0.000790569, gnorm=0.329, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=6232 epoch 004: 1343 / 1689 loss=4.581, nll_loss=3, ppl=8, wps=457529, ups=1.06, wpb=432575, bsz=16655.4, num_updates=6400, lr=0.000790569, gnorm=0.329, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=6232 epoch 004: 1343 / 1689 loss=4.581, nll_loss=3, ppl=8, wps=457529, ups=1.06, wpb=432575, bsz=16655.4, num_updates=6400, lr=0.000790569, gnorm=0.329, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=6232 epoch 004: 1443 / 1689 loss=4.573, nll_loss=2.991, ppl=7.95, wps=461484, ups=1.06, wpb=434366, bsz=16711.3, num_updates=6500, lr=0.000784465, gnorm=0.325, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=6326 epoch 004: 1443 / 1689 loss=4.573, nll_loss=2.991, ppl=7.95, wps=461484, ups=1.06, wpb=434366, bsz=16711.3, num_updates=6500, lr=0.000784465, gnorm=0.325, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=6326 epoch 004: 1443 / 1689 loss=4.573, nll_loss=2.991, ppl=7.95, wps=461484, ups=1.06, wpb=434366, bsz=16711.3, num_updates=6500, lr=0.000784465, gnorm=0.325, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=6326 epoch 004: 1443 / 1689 loss=4.573, nll_loss=2.991, ppl=7.95, wps=461484, ups=1.06, wpb=434366, bsz=16711.3, num_updates=6500, lr=0.000784465, gnorm=0.325, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=6326 epoch 004: 1543 / 1689 loss=4.574, nll_loss=2.992, ppl=7.96, wps=458328, ups=1.06, wpb=434368, bsz=16276.8, num_updates=6600, lr=0.000778499, gnorm=0.321, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=6421 epoch 004: 1543 / 1689 loss=4.574, nll_loss=2.992, ppl=7.96, wps=458328, ups=1.06, wpb=434368, bsz=16276.8, num_updates=6600, lr=0.000778499, gnorm=0.321, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=6421 epoch 004: 1543 / 1689 loss=4.574, nll_loss=2.992, ppl=7.96, wps=458328, ups=1.06, wpb=434368, bsz=16276.8, num_updates=6600, lr=0.000778499, gnorm=0.321, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=6421 epoch 004: 1543 / 1689 loss=4.574, nll_loss=2.992, ppl=7.96, wps=458328, ups=1.06, wpb=434368, bsz=16276.8, num_updates=6600, lr=0.000778499, gnorm=0.321, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=6421 epoch 004: 1643 / 1689 loss=4.572, nll_loss=2.991, ppl=7.95, wps=458927, ups=1.06, wpb=432801, bsz=16221.1, num_updates=6700, lr=0.000772667, gnorm=0.325, clip=0, loss_scale=4, train_wall=92, gb_free=18.4, wall=6515 epoch 004: 1643 / 1689 loss=4.572, nll_loss=2.991, ppl=7.95, wps=458927, ups=1.06, wpb=432801, bsz=16221.1, num_updates=6700, lr=0.000772667, gnorm=0.325, clip=0, loss_scale=4, train_wall=92, gb_free=18.4, wall=6515 epoch 004: 1643 / 1689 loss=4.572, nll_loss=2.991, ppl=7.95, wps=458927, ups=1.06, wpb=432801, bsz=16221.1, num_updates=6700, lr=0.000772667, gnorm=0.325, clip=0, loss_scale=4, train_wall=92, gb_free=18.4, wall=6515 epoch 004: 1643 / 1689 loss=4.572, nll_loss=2.991, ppl=7.95, wps=458927, ups=1.06, wpb=432801, bsz=16221.1, num_updates=6700, lr=0.000772667, gnorm=0.325, clip=0, loss_scale=4, train_wall=92, gb_free=18.4, wall=6515 end of epoch 4 (average epoch stats below) epoch 004 | loss 4.61 | nll_loss 3.031 | ppl 8.17 | wps 451233 | ups 1.04 | wpb 433539 | bsz 16507.3 | num_updates 6746 | lr 0.000770029 | gnorm 0.335 | clip 0 | loss_scale 4 | train_wall 1573 | gb_free 19.4 | wall 6558 epoch 004 | loss 4.61 | nll_loss 3.031 | ppl 8.17 | wps 451233 | ups 1.04 | wpb 433539 | bsz 16507.3 | num_updates 6746 | lr 0.000770029 | gnorm 0.335 | clip 0 | loss_scale 4 | train_wall 1573 | gb_free 19.4 | wall 6558 epoch 004 | loss 4.61 | nll_loss 3.031 | ppl 8.17 | wps 451233 | ups 1.04 | wpb 433539 | bsz 16507.3 | num_updates 6746 | lr 0.000770029 | gnorm 0.335 | clip 0 | loss_scale 4 | train_wall 1573 | gb_free 19.4 | wall 6558 epoch 004 | loss 4.61 | nll_loss 3.031 | ppl 8.17 | wps 451233 | ups 1.04 | wpb 433539 | bsz 16507.3 | num_updates 6746 | lr 0.000770029 | gnorm 0.335 | clip 0 | loss_scale 4 | train_wall 1573 | gb_free 19.4 | wall 6558 Start iterating over samples epoch 005: 54 / 1689 loss=4.534, nll_loss=2.947, ppl=7.71, wps=451576, ups=1.05, wpb=429170, bsz=16259, num_updates=6800, lr=0.000766965, gnorm=0.314, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=6610 epoch 005: 54 / 1689 loss=4.534, nll_loss=2.947, ppl=7.71, wps=451576, ups=1.05, wpb=429170, bsz=16259, num_updates=6800, lr=0.000766965, gnorm=0.314, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=6610 epoch 005: 54 / 1689 loss=4.534, nll_loss=2.947, ppl=7.71, wps=451576, ups=1.05, wpb=429170, bsz=16259, num_updates=6800, lr=0.000766965, gnorm=0.314, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=6610 epoch 005: 54 / 1689 loss=4.534, nll_loss=2.947, ppl=7.71, wps=451576, ups=1.05, wpb=429170, bsz=16259, num_updates=6800, lr=0.000766965, gnorm=0.314, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=6610 epoch 005: 54 / 1689 loss=4.534, nll_loss=2.947, ppl=7.71, wps=451576, ups=1.05, wpb=429170, bsz=16259, num_updates=6800, lr=0.000766965, gnorm=0.314, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=6610 epoch 005: 155 / 1689 loss=4.529, nll_loss=2.941, ppl=7.68, wps=454411, ups=1.05, wpb=434130, bsz=16388, num_updates=6900, lr=0.000761387, gnorm=0.319, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=6706 epoch 005: 155 / 1689 loss=4.529, nll_loss=2.941, ppl=7.68, wps=454411, ups=1.05, wpb=434130, bsz=16388, num_updates=6900, lr=0.000761387, gnorm=0.319, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=6706 epoch 005: 155 / 1689 loss=4.529, nll_loss=2.941, ppl=7.68, wps=454411, ups=1.05, wpb=434130, bsz=16388, num_updates=6900, lr=0.000761387, gnorm=0.319, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=6706 epoch 005: 155 / 1689 loss=4.529, nll_loss=2.941, ppl=7.68, wps=454411, ups=1.05, wpb=434130, bsz=16388, num_updates=6900, lr=0.000761387, gnorm=0.319, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=6706 epoch 005: 155 / 1689 loss=4.529, nll_loss=2.941, ppl=7.68, wps=454411, ups=1.05, wpb=434130, bsz=16388, num_updates=6900, lr=0.000761387, gnorm=0.319, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=6706 epoch 005: 255 / 1689 loss=4.526, nll_loss=2.938, ppl=7.66, wps=454600, ups=1.05, wpb=432647, bsz=16303, num_updates=7000, lr=0.000755929, gnorm=0.328, clip=0, loss_scale=2, train_wall=94, gb_free=20.7, wall=6801 epoch 005: 255 / 1689 loss=4.526, nll_loss=2.938, ppl=7.66, wps=454600, ups=1.05, wpb=432647, bsz=16303, num_updates=7000, lr=0.000755929, gnorm=0.328, clip=0, loss_scale=2, train_wall=94, gb_free=20.7, wall=6801 epoch 005: 255 / 1689 loss=4.526, nll_loss=2.938, ppl=7.66, wps=454600, ups=1.05, wpb=432647, bsz=16303, num_updates=7000, lr=0.000755929, gnorm=0.328, clip=0, loss_scale=2, train_wall=94, gb_free=20.7, wall=6801 epoch 005: 255 / 1689 loss=4.526, nll_loss=2.938, ppl=7.66, wps=454600, ups=1.05, wpb=432647, bsz=16303, num_updates=7000, lr=0.000755929, gnorm=0.328, clip=0, loss_scale=2, train_wall=94, gb_free=20.7, wall=6801 epoch 005: 255 / 1689 loss=4.526, nll_loss=2.938, ppl=7.66, wps=454600, ups=1.05, wpb=432647, bsz=16303, num_updates=7000, lr=0.000755929, gnorm=0.328, clip=0, loss_scale=2, train_wall=94, gb_free=20.7, wall=6801 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 4.502 | nll_loss 2.859 | ppl 7.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.502 epoch 005 | valid on 'valid' subset | loss 4.502 | nll_loss 2.859 | ppl 7.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.502 epoch 005 | valid on 'valid' subset | loss 4.502 | nll_loss 2.859 | ppl 7.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.502 epoch 005 | valid on 'valid' subset | loss 4.502 | nll_loss 2.859 | ppl 7.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.502 epoch 005 | valid on 'valid' subset | loss 4.502 | nll_loss 2.859 | ppl 7.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.502 epoch 005: 355 / 1689 loss=4.516, nll_loss=2.927, ppl=7.61, wps=181608, ups=0.42, wpb=434696, bsz=16697.7, num_updates=7100, lr=0.000750587, gnorm=0.314, clip=0, loss_scale=2, train_wall=196, gb_free=19, wall=7040 epoch 005: 355 / 1689 loss=4.516, nll_loss=2.927, ppl=7.61, wps=181608, ups=0.42, wpb=434696, bsz=16697.7, num_updates=7100, lr=0.000750587, gnorm=0.314, clip=0, loss_scale=2, train_wall=196, gb_free=19, wall=7040 epoch 005: 355 / 1689 loss=4.516, nll_loss=2.927, ppl=7.61, wps=181608, ups=0.42, wpb=434696, bsz=16697.7, num_updates=7100, lr=0.000750587, gnorm=0.314, clip=0, loss_scale=2, train_wall=196, gb_free=19, wall=7040 epoch 005: 355 / 1689 loss=4.516, nll_loss=2.927, ppl=7.61, wps=181608, ups=0.42, wpb=434696, bsz=16697.7, num_updates=7100, lr=0.000750587, gnorm=0.314, clip=0, loss_scale=2, train_wall=196, gb_free=19, wall=7040 epoch 005: 355 / 1689 loss=4.516, nll_loss=2.927, ppl=7.61, wps=181608, ups=0.42, wpb=434696, bsz=16697.7, num_updates=7100, lr=0.000750587, gnorm=0.314, clip=0, loss_scale=2, train_wall=196, gb_free=19, wall=7040 epoch 005: 455 / 1689 loss=4.524, nll_loss=2.937, ppl=7.66, wps=464398, ups=1.07, wpb=432710, bsz=16605.7, num_updates=7200, lr=0.000745356, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=7133 epoch 005: 455 / 1689 loss=4.524, nll_loss=2.937, ppl=7.66, wps=464398, ups=1.07, wpb=432710, bsz=16605.7, num_updates=7200, lr=0.000745356, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=7133 epoch 005: 455 / 1689 loss=4.524, nll_loss=2.937, ppl=7.66, wps=464398, ups=1.07, wpb=432710, bsz=16605.7, num_updates=7200, lr=0.000745356, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=7133 epoch 005: 455 / 1689 loss=4.524, nll_loss=2.937, ppl=7.66, wps=464398, ups=1.07, wpb=432710, bsz=16605.7, num_updates=7200, lr=0.000745356, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=7133 epoch 005: 455 / 1689 loss=4.524, nll_loss=2.937, ppl=7.66, wps=464398, ups=1.07, wpb=432710, bsz=16605.7, num_updates=7200, lr=0.000745356, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=7133 epoch 005: 555 / 1689 loss=4.512, nll_loss=2.924, ppl=7.59, wps=462327, ups=1.07, wpb=434096, bsz=16627.2, num_updates=7300, lr=0.000740233, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=7227 epoch 005: 555 / 1689 loss=4.512, nll_loss=2.924, ppl=7.59, wps=462327, ups=1.07, wpb=434096, bsz=16627.2, num_updates=7300, lr=0.000740233, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=7227 epoch 005: 555 / 1689 loss=4.512, nll_loss=2.924, ppl=7.59, wps=462327, ups=1.07, wpb=434096, bsz=16627.2, num_updates=7300, lr=0.000740233, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=7227 epoch 005: 555 / 1689 loss=4.512, nll_loss=2.924, ppl=7.59, wps=462327, ups=1.07, wpb=434096, bsz=16627.2, num_updates=7300, lr=0.000740233, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=7227 epoch 005: 555 / 1689 loss=4.512, nll_loss=2.924, ppl=7.59, wps=462327, ups=1.07, wpb=434096, bsz=16627.2, num_updates=7300, lr=0.000740233, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=7227 epoch 005: 655 / 1689 loss=4.517, nll_loss=2.93, ppl=7.62, wps=458821, ups=1.06, wpb=433115, bsz=16423.4, num_updates=7400, lr=0.000735215, gnorm=0.306, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=7322 epoch 005: 655 / 1689 loss=4.517, nll_loss=2.93, ppl=7.62, wps=458821, ups=1.06, wpb=433115, bsz=16423.4, num_updates=7400, lr=0.000735215, gnorm=0.306, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=7322 epoch 005: 655 / 1689 loss=4.517, nll_loss=2.93, ppl=7.62, wps=458821, ups=1.06, wpb=433115, bsz=16423.4, num_updates=7400, lr=0.000735215, gnorm=0.306, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=7322 epoch 005: 655 / 1689 loss=4.517, nll_loss=2.93, ppl=7.62, wps=458821, ups=1.06, wpb=433115, bsz=16423.4, num_updates=7400, lr=0.000735215, gnorm=0.306, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=7322 epoch 005: 655 / 1689 loss=4.517, nll_loss=2.93, ppl=7.62, wps=458821, ups=1.06, wpb=433115, bsz=16423.4, num_updates=7400, lr=0.000735215, gnorm=0.306, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=7322 epoch 005: 755 / 1689 loss=4.511, nll_loss=2.924, ppl=7.59, wps=459081, ups=1.06, wpb=431502, bsz=16385, num_updates=7500, lr=0.000730297, gnorm=0.299, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=7416 epoch 005: 755 / 1689 loss=4.511, nll_loss=2.924, ppl=7.59, wps=459081, ups=1.06, wpb=431502, bsz=16385, num_updates=7500, lr=0.000730297, gnorm=0.299, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=7416 epoch 005: 755 / 1689 loss=4.511, nll_loss=2.924, ppl=7.59, wps=459081, ups=1.06, wpb=431502, bsz=16385, num_updates=7500, lr=0.000730297, gnorm=0.299, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=7416 epoch 005: 755 / 1689 loss=4.511, nll_loss=2.924, ppl=7.59, wps=459081, ups=1.06, wpb=431502, bsz=16385, num_updates=7500, lr=0.000730297, gnorm=0.299, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=7416 epoch 005: 755 / 1689 loss=4.511, nll_loss=2.924, ppl=7.59, wps=459081, ups=1.06, wpb=431502, bsz=16385, num_updates=7500, lr=0.000730297, gnorm=0.299, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=7416 epoch 005: 856 / 1689 loss=4.507, nll_loss=2.919, ppl=7.56, wps=454078, ups=1.04, wpb=435132, bsz=16554.5, num_updates=7600, lr=0.000725476, gnorm=0.309, clip=0, loss_scale=2, train_wall=95, gb_free=14.6, wall=7512 epoch 005: 856 / 1689 loss=4.507, nll_loss=2.919, ppl=7.56, wps=454078, ups=1.04, wpb=435132, bsz=16554.5, num_updates=7600, lr=0.000725476, gnorm=0.309, clip=0, loss_scale=2, train_wall=95, gb_free=14.6, wall=7512 epoch 005: 856 / 1689 loss=4.507, nll_loss=2.919, ppl=7.56, wps=454078, ups=1.04, wpb=435132, bsz=16554.5, num_updates=7600, lr=0.000725476, gnorm=0.309, clip=0, loss_scale=2, train_wall=95, gb_free=14.6, wall=7512 epoch 005: 856 / 1689 loss=4.507, nll_loss=2.919, ppl=7.56, wps=454078, ups=1.04, wpb=435132, bsz=16554.5, num_updates=7600, lr=0.000725476, gnorm=0.309, clip=0, loss_scale=2, train_wall=95, gb_free=14.6, wall=7512 epoch 005: 856 / 1689 loss=4.507, nll_loss=2.919, ppl=7.56, wps=454078, ups=1.04, wpb=435132, bsz=16554.5, num_updates=7600, lr=0.000725476, gnorm=0.309, clip=0, loss_scale=2, train_wall=95, gb_free=14.6, wall=7512 epoch 005: 956 / 1689 loss=4.509, nll_loss=2.922, ppl=7.58, wps=460648, ups=1.06, wpb=435536, bsz=16452.2, num_updates=7700, lr=0.00072075, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=7606 epoch 005: 956 / 1689 loss=4.509, nll_loss=2.922, ppl=7.58, wps=460648, ups=1.06, wpb=435536, bsz=16452.2, num_updates=7700, lr=0.00072075, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=7606 epoch 005: 956 / 1689 loss=4.509, nll_loss=2.922, ppl=7.58, wps=460648, ups=1.06, wpb=435536, bsz=16452.2, num_updates=7700, lr=0.00072075, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=7606 epoch 005: 956 / 1689 loss=4.509, nll_loss=2.922, ppl=7.58, wps=460648, ups=1.06, wpb=435536, bsz=16452.2, num_updates=7700, lr=0.00072075, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=7606 epoch 005: 956 / 1689 loss=4.509, nll_loss=2.922, ppl=7.58, wps=460648, ups=1.06, wpb=435536, bsz=16452.2, num_updates=7700, lr=0.00072075, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=7606 epoch 005: 1056 / 1689 loss=4.491, nll_loss=2.902, ppl=7.47, wps=459314, ups=1.06, wpb=433987, bsz=16660.3, num_updates=7800, lr=0.000716115, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=7701 epoch 005: 1056 / 1689 loss=4.491, nll_loss=2.902, ppl=7.47, wps=459314, ups=1.06, wpb=433987, bsz=16660.3, num_updates=7800, lr=0.000716115, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=7701 epoch 005: 1056 / 1689 loss=4.491, nll_loss=2.902, ppl=7.47, wps=459314, ups=1.06, wpb=433987, bsz=16660.3, num_updates=7800, lr=0.000716115, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=7701 epoch 005: 1056 / 1689 loss=4.491, nll_loss=2.902, ppl=7.47, wps=459314, ups=1.06, wpb=433987, bsz=16660.3, num_updates=7800, lr=0.000716115, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=7701 epoch 005: 1056 / 1689 loss=4.491, nll_loss=2.902, ppl=7.47, wps=459314, ups=1.06, wpb=433987, bsz=16660.3, num_updates=7800, lr=0.000716115, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=7701 epoch 005: 1156 / 1689 loss=4.495, nll_loss=2.906, ppl=7.49, wps=456609, ups=1.05, wpb=435261, bsz=16685.9, num_updates=7900, lr=0.000711568, gnorm=0.297, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=7796 epoch 005: 1156 / 1689 loss=4.495, nll_loss=2.906, ppl=7.49, wps=456609, ups=1.05, wpb=435261, bsz=16685.9, num_updates=7900, lr=0.000711568, gnorm=0.297, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=7796 epoch 005: 1156 / 1689 loss=4.495, nll_loss=2.906, ppl=7.49, wps=456609, ups=1.05, wpb=435261, bsz=16685.9, num_updates=7900, lr=0.000711568, gnorm=0.297, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=7796 epoch 005: 1156 / 1689 loss=4.495, nll_loss=2.906, ppl=7.49, wps=456609, ups=1.05, wpb=435261, bsz=16685.9, num_updates=7900, lr=0.000711568, gnorm=0.297, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=7796 epoch 005: 1156 / 1689 loss=4.495, nll_loss=2.906, ppl=7.49, wps=456609, ups=1.05, wpb=435261, bsz=16685.9, num_updates=7900, lr=0.000711568, gnorm=0.297, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=7796 epoch 005: 1256 / 1689 loss=4.494, nll_loss=2.905, ppl=7.49, wps=455631, ups=1.05, wpb=434156, bsz=16362.6, num_updates=8000, lr=0.000707107, gnorm=0.304, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=7891 epoch 005: 1256 / 1689 loss=4.494, nll_loss=2.905, ppl=7.49, wps=455631, ups=1.05, wpb=434156, bsz=16362.6, num_updates=8000, lr=0.000707107, gnorm=0.304, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=7891 epoch 005: 1256 / 1689 loss=4.494, nll_loss=2.905, ppl=7.49, wps=455631, ups=1.05, wpb=434156, bsz=16362.6, num_updates=8000, lr=0.000707107, gnorm=0.304, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=7891 epoch 005: 1256 / 1689 loss=4.494, nll_loss=2.905, ppl=7.49, wps=455631, ups=1.05, wpb=434156, bsz=16362.6, num_updates=8000, lr=0.000707107, gnorm=0.304, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=7891 epoch 005: 1256 / 1689 loss=4.494, nll_loss=2.905, ppl=7.49, wps=455631, ups=1.05, wpb=434156, bsz=16362.6, num_updates=8000, lr=0.000707107, gnorm=0.304, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=7891 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 4.445 | nll_loss 2.805 | ppl 6.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.445 epoch 005 | valid on 'valid' subset | loss 4.445 | nll_loss 2.805 | ppl 6.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.445 epoch 005 | valid on 'valid' subset | loss 4.445 | nll_loss 2.805 | ppl 6.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.445 epoch 005 | valid on 'valid' subset | loss 4.445 | nll_loss 2.805 | ppl 6.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.445 epoch 005 | valid on 'valid' subset | loss 4.445 | nll_loss 2.805 | ppl 6.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.445 epoch 005: 1356 / 1689 loss=4.491, nll_loss=2.903, ppl=7.48, wps=380804, ups=0.88, wpb=434844, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.291, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=8005 epoch 005: 1356 / 1689 loss=4.491, nll_loss=2.903, ppl=7.48, wps=380804, ups=0.88, wpb=434844, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.291, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=8005 epoch 005: 1356 / 1689 loss=4.491, nll_loss=2.903, ppl=7.48, wps=380804, ups=0.88, wpb=434844, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.291, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=8005 epoch 005: 1356 / 1689 loss=4.491, nll_loss=2.903, ppl=7.48, wps=380804, ups=0.88, wpb=434844, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.291, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=8005 epoch 005: 1356 / 1689 loss=4.491, nll_loss=2.903, ppl=7.48, wps=380804, ups=0.88, wpb=434844, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.291, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=8005 epoch 005: 1456 / 1689 loss=4.488, nll_loss=2.899, ppl=7.46, wps=462929, ups=1.07, wpb=433482, bsz=16498.8, num_updates=8200, lr=0.00069843, gnorm=0.303, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=8099 epoch 005: 1456 / 1689 loss=4.488, nll_loss=2.899, ppl=7.46, wps=462929, ups=1.07, wpb=433482, bsz=16498.8, num_updates=8200, lr=0.00069843, gnorm=0.303, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=8099 epoch 005: 1456 / 1689 loss=4.488, nll_loss=2.899, ppl=7.46, wps=462929, ups=1.07, wpb=433482, bsz=16498.8, num_updates=8200, lr=0.00069843, gnorm=0.303, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=8099 epoch 005: 1456 / 1689 loss=4.488, nll_loss=2.899, ppl=7.46, wps=462929, ups=1.07, wpb=433482, bsz=16498.8, num_updates=8200, lr=0.00069843, gnorm=0.303, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=8099 epoch 005: 1456 / 1689 loss=4.488, nll_loss=2.899, ppl=7.46, wps=462929, ups=1.07, wpb=433482, bsz=16498.8, num_updates=8200, lr=0.00069843, gnorm=0.303, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=8099 epoch 005: 1556 / 1689 loss=4.48, nll_loss=2.89, ppl=7.41, wps=458953, ups=1.06, wpb=432125, bsz=16183.8, num_updates=8300, lr=0.00069421, gnorm=0.296, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=8193 epoch 005: 1556 / 1689 loss=4.48, nll_loss=2.89, ppl=7.41, wps=458953, ups=1.06, wpb=432125, bsz=16183.8, num_updates=8300, lr=0.00069421, gnorm=0.296, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=8193 epoch 005: 1556 / 1689 loss=4.48, nll_loss=2.89, ppl=7.41, wps=458953, ups=1.06, wpb=432125, bsz=16183.8, num_updates=8300, lr=0.00069421, gnorm=0.296, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=8193 epoch 005: 1556 / 1689 loss=4.48, nll_loss=2.89, ppl=7.41, wps=458953, ups=1.06, wpb=432125, bsz=16183.8, num_updates=8300, lr=0.00069421, gnorm=0.296, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=8193 epoch 005: 1556 / 1689 loss=4.48, nll_loss=2.89, ppl=7.41, wps=458953, ups=1.06, wpb=432125, bsz=16183.8, num_updates=8300, lr=0.00069421, gnorm=0.296, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=8193 epoch 005: 1656 / 1689 loss=4.486, nll_loss=2.898, ppl=7.45, wps=460908, ups=1.07, wpb=432628, bsz=16823.9, num_updates=8400, lr=0.000690066, gnorm=0.294, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=8287 epoch 005: 1656 / 1689 loss=4.486, nll_loss=2.898, ppl=7.45, wps=460908, ups=1.07, wpb=432628, bsz=16823.9, num_updates=8400, lr=0.000690066, gnorm=0.294, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=8287 epoch 005: 1656 / 1689 loss=4.486, nll_loss=2.898, ppl=7.45, wps=460908, ups=1.07, wpb=432628, bsz=16823.9, num_updates=8400, lr=0.000690066, gnorm=0.294, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=8287 epoch 005: 1656 / 1689 loss=4.486, nll_loss=2.898, ppl=7.45, wps=460908, ups=1.07, wpb=432628, bsz=16823.9, num_updates=8400, lr=0.000690066, gnorm=0.294, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=8287 epoch 005: 1656 / 1689 loss=4.486, nll_loss=2.898, ppl=7.45, wps=460908, ups=1.07, wpb=432628, bsz=16823.9, num_updates=8400, lr=0.000690066, gnorm=0.294, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=8287 end of epoch 5 (average epoch stats below) epoch 005 | loss 4.505 | nll_loss 2.917 | ppl 7.55 | wps 415606 | ups 0.96 | wpb 433519 | bsz 16504.9 | num_updates 8433 | lr 0.000688714 | gnorm 0.304 | clip 0 | loss_scale 4 | train_wall 1680 | gb_free 23.8 | wall 8317 epoch 005 | loss 4.505 | nll_loss 2.917 | ppl 7.55 | wps 415606 | ups 0.96 | wpb 433519 | bsz 16504.9 | num_updates 8433 | lr 0.000688714 | gnorm 0.304 | clip 0 | loss_scale 4 | train_wall 1680 | gb_free 23.8 | wall 8317 epoch 005 | loss 4.505 | nll_loss 2.917 | ppl 7.55 | wps 415606 | ups 0.96 | wpb 433519 | bsz 16504.9 | num_updates 8433 | lr 0.000688714 | gnorm 0.304 | clip 0 | loss_scale 4 | train_wall 1680 | gb_free 23.8 | wall 8317 epoch 005 | loss 4.505 | nll_loss 2.917 | ppl 7.55 | wps 415606 | ups 0.96 | wpb 433519 | bsz 16504.9 | num_updates 8433 | lr 0.000688714 | gnorm 0.304 | clip 0 | loss_scale 4 | train_wall 1680 | gb_free 23.8 | wall 8317 epoch 005 | loss 4.505 | nll_loss 2.917 | ppl 7.55 | wps 415606 | ups 0.96 | wpb 433519 | bsz 16504.9 | num_updates 8433 | lr 0.000688714 | gnorm 0.304 | clip 0 | loss_scale 4 | train_wall 1680 | gb_free 23.8 | wall 8317 Start iterating over samples epoch 006: 67 / 1689 loss=4.447, nll_loss=2.852, ppl=7.22, wps=457018, ups=1.06, wpb=430161, bsz=16103.7, num_updates=8500, lr=0.000685994, gnorm=0.292, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=8381 epoch 006: 67 / 1689 loss=4.447, nll_loss=2.852, ppl=7.22, wps=457018, ups=1.06, wpb=430161, bsz=16103.7, num_updates=8500, lr=0.000685994, gnorm=0.292, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=8381 epoch 006: 67 / 1689 loss=4.447, nll_loss=2.852, ppl=7.22, wps=457018, ups=1.06, wpb=430161, bsz=16103.7, num_updates=8500, lr=0.000685994, gnorm=0.292, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=8381 epoch 006: 67 / 1689 loss=4.447, nll_loss=2.852, ppl=7.22, wps=457018, ups=1.06, wpb=430161, bsz=16103.7, num_updates=8500, lr=0.000685994, gnorm=0.292, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=8381 epoch 006: 67 / 1689 loss=4.447, nll_loss=2.852, ppl=7.22, wps=457018, ups=1.06, wpb=430161, bsz=16103.7, num_updates=8500, lr=0.000685994, gnorm=0.292, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=8381 epoch 006: 67 / 1689 loss=4.447, nll_loss=2.852, ppl=7.22, wps=457018, ups=1.06, wpb=430161, bsz=16103.7, num_updates=8500, lr=0.000685994, gnorm=0.292, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=8381 epoch 006: 167 / 1689 loss=4.451, nll_loss=2.857, ppl=7.24, wps=461489, ups=1.06, wpb=434768, bsz=16317.4, num_updates=8600, lr=0.000681994, gnorm=0.286, clip=0, loss_scale=8, train_wall=93, gb_free=18.3, wall=8475 epoch 006: 167 / 1689 loss=4.451, nll_loss=2.857, ppl=7.24, wps=461489, ups=1.06, wpb=434768, bsz=16317.4, num_updates=8600, lr=0.000681994, gnorm=0.286, clip=0, loss_scale=8, train_wall=93, gb_free=18.3, wall=8475 epoch 006: 167 / 1689 loss=4.451, nll_loss=2.857, ppl=7.24, wps=461489, ups=1.06, wpb=434768, bsz=16317.4, num_updates=8600, lr=0.000681994, gnorm=0.286, clip=0, loss_scale=8, train_wall=93, gb_free=18.3, wall=8475 epoch 006: 167 / 1689 loss=4.451, nll_loss=2.857, ppl=7.24, wps=461489, ups=1.06, wpb=434768, bsz=16317.4, num_updates=8600, lr=0.000681994, gnorm=0.286, clip=0, loss_scale=8, train_wall=93, gb_free=18.3, wall=8475 epoch 006: 167 / 1689 loss=4.451, nll_loss=2.857, ppl=7.24, wps=461489, ups=1.06, wpb=434768, bsz=16317.4, num_updates=8600, lr=0.000681994, gnorm=0.286, clip=0, loss_scale=8, train_wall=93, gb_free=18.3, wall=8475 epoch 006: 167 / 1689 loss=4.451, nll_loss=2.857, ppl=7.24, wps=461489, ups=1.06, wpb=434768, bsz=16317.4, num_updates=8600, lr=0.000681994, gnorm=0.286, clip=0, loss_scale=8, train_wall=93, gb_free=18.3, wall=8475 epoch 006: 268 / 1689 loss=4.44, nll_loss=2.845, ppl=7.18, wps=451711, ups=1.04, wpb=433100, bsz=16643.1, num_updates=8700, lr=0.000678064, gnorm=0.287, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=8571 epoch 006: 268 / 1689 loss=4.44, nll_loss=2.845, ppl=7.18, wps=451711, ups=1.04, wpb=433100, bsz=16643.1, num_updates=8700, lr=0.000678064, gnorm=0.287, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=8571 epoch 006: 268 / 1689 loss=4.44, nll_loss=2.845, ppl=7.18, wps=451711, ups=1.04, wpb=433100, bsz=16643.1, num_updates=8700, lr=0.000678064, gnorm=0.287, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=8571 epoch 006: 268 / 1689 loss=4.44, nll_loss=2.845, ppl=7.18, wps=451711, ups=1.04, wpb=433100, bsz=16643.1, num_updates=8700, lr=0.000678064, gnorm=0.287, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=8571 epoch 006: 268 / 1689 loss=4.44, nll_loss=2.845, ppl=7.18, wps=451711, ups=1.04, wpb=433100, bsz=16643.1, num_updates=8700, lr=0.000678064, gnorm=0.287, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=8571 epoch 006: 268 / 1689 loss=4.44, nll_loss=2.845, ppl=7.18, wps=451711, ups=1.04, wpb=433100, bsz=16643.1, num_updates=8700, lr=0.000678064, gnorm=0.287, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=8571 epoch 006: 368 / 1689 loss=4.45, nll_loss=2.856, ppl=7.24, wps=458253, ups=1.06, wpb=433246, bsz=16359.8, num_updates=8800, lr=0.0006742, gnorm=0.287, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=8666 epoch 006: 368 / 1689 loss=4.45, nll_loss=2.856, ppl=7.24, wps=458253, ups=1.06, wpb=433246, bsz=16359.8, num_updates=8800, lr=0.0006742, gnorm=0.287, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=8666 epoch 006: 368 / 1689 loss=4.45, nll_loss=2.856, ppl=7.24, wps=458253, ups=1.06, wpb=433246, bsz=16359.8, num_updates=8800, lr=0.0006742, gnorm=0.287, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=8666 epoch 006: 368 / 1689 loss=4.45, nll_loss=2.856, ppl=7.24, wps=458253, ups=1.06, wpb=433246, bsz=16359.8, num_updates=8800, lr=0.0006742, gnorm=0.287, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=8666 epoch 006: 368 / 1689 loss=4.45, nll_loss=2.856, ppl=7.24, wps=458253, ups=1.06, wpb=433246, bsz=16359.8, num_updates=8800, lr=0.0006742, gnorm=0.287, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=8666 epoch 006: 368 / 1689 loss=4.45, nll_loss=2.856, ppl=7.24, wps=458253, ups=1.06, wpb=433246, bsz=16359.8, num_updates=8800, lr=0.0006742, gnorm=0.287, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=8666 epoch 006: 468 / 1689 loss=4.443, nll_loss=2.849, ppl=7.2, wps=454177, ups=1.05, wpb=432759, bsz=16446.9, num_updates=8900, lr=0.000670402, gnorm=0.294, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=8761 epoch 006: 468 / 1689 loss=4.443, nll_loss=2.849, ppl=7.2, wps=454177, ups=1.05, wpb=432759, bsz=16446.9, num_updates=8900, lr=0.000670402, gnorm=0.294, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=8761 epoch 006: 468 / 1689 loss=4.443, nll_loss=2.849, ppl=7.2, wps=454177, ups=1.05, wpb=432759, bsz=16446.9, num_updates=8900, lr=0.000670402, gnorm=0.294, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=8761 epoch 006: 468 / 1689 loss=4.443, nll_loss=2.849, ppl=7.2, wps=454177, ups=1.05, wpb=432759, bsz=16446.9, num_updates=8900, lr=0.000670402, gnorm=0.294, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=8761 epoch 006: 468 / 1689 loss=4.443, nll_loss=2.849, ppl=7.2, wps=454177, ups=1.05, wpb=432759, bsz=16446.9, num_updates=8900, lr=0.000670402, gnorm=0.294, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=8761 epoch 006: 468 / 1689 loss=4.443, nll_loss=2.849, ppl=7.2, wps=454177, ups=1.05, wpb=432759, bsz=16446.9, num_updates=8900, lr=0.000670402, gnorm=0.294, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=8761 epoch 006: 568 / 1689 loss=4.44, nll_loss=2.846, ppl=7.19, wps=461547, ups=1.06, wpb=434642, bsz=16585.1, num_updates=9000, lr=0.000666667, gnorm=0.292, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=8855 epoch 006: 568 / 1689 loss=4.44, nll_loss=2.846, ppl=7.19, wps=461547, ups=1.06, wpb=434642, bsz=16585.1, num_updates=9000, lr=0.000666667, gnorm=0.292, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=8855 epoch 006: 568 / 1689 loss=4.44, nll_loss=2.846, ppl=7.19, wps=461547, ups=1.06, wpb=434642, bsz=16585.1, num_updates=9000, lr=0.000666667, gnorm=0.292, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=8855 epoch 006: 568 / 1689 loss=4.44, nll_loss=2.846, ppl=7.19, wps=461547, ups=1.06, wpb=434642, bsz=16585.1, num_updates=9000, lr=0.000666667, gnorm=0.292, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=8855 epoch 006: 568 / 1689 loss=4.44, nll_loss=2.846, ppl=7.19, wps=461547, ups=1.06, wpb=434642, bsz=16585.1, num_updates=9000, lr=0.000666667, gnorm=0.292, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=8855 epoch 006: 568 / 1689 loss=4.44, nll_loss=2.846, ppl=7.19, wps=461547, ups=1.06, wpb=434642, bsz=16585.1, num_updates=9000, lr=0.000666667, gnorm=0.292, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=8855 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 4.418 | nll_loss 2.784 | ppl 6.89 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.418 epoch 006 | valid on 'valid' subset | loss 4.418 | nll_loss 2.784 | ppl 6.89 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.418 epoch 006 | valid on 'valid' subset | loss 4.418 | nll_loss 2.784 | ppl 6.89 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.418 epoch 006 | valid on 'valid' subset | loss 4.418 | nll_loss 2.784 | ppl 6.89 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.418 epoch 006 | valid on 'valid' subset | loss 4.418 | nll_loss 2.784 | ppl 6.89 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.418 epoch 006 | valid on 'valid' subset | loss 4.418 | nll_loss 2.784 | ppl 6.89 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.418 epoch 006: 668 / 1689 loss=4.445, nll_loss=2.851, ppl=7.22, wps=285383, ups=0.66, wpb=432332, bsz=16809, num_updates=9100, lr=0.000662994, gnorm=0.276, clip=0, loss_scale=4, train_wall=125, gb_free=19.9, wall=9007 epoch 006: 668 / 1689 loss=4.445, nll_loss=2.851, ppl=7.22, wps=285383, ups=0.66, wpb=432332, bsz=16809, num_updates=9100, lr=0.000662994, gnorm=0.276, clip=0, loss_scale=4, train_wall=125, gb_free=19.9, wall=9007 epoch 006: 668 / 1689 loss=4.445, nll_loss=2.851, ppl=7.22, wps=285383, ups=0.66, wpb=432332, bsz=16809, num_updates=9100, lr=0.000662994, gnorm=0.276, clip=0, loss_scale=4, train_wall=125, gb_free=19.9, wall=9007 epoch 006: 668 / 1689 loss=4.445, nll_loss=2.851, ppl=7.22, wps=285383, ups=0.66, wpb=432332, bsz=16809, num_updates=9100, lr=0.000662994, gnorm=0.276, clip=0, loss_scale=4, train_wall=125, gb_free=19.9, wall=9007 epoch 006: 668 / 1689 loss=4.445, nll_loss=2.851, ppl=7.22, wps=285383, ups=0.66, wpb=432332, bsz=16809, num_updates=9100, lr=0.000662994, gnorm=0.276, clip=0, loss_scale=4, train_wall=125, gb_free=19.9, wall=9007 epoch 006: 668 / 1689 loss=4.445, nll_loss=2.851, ppl=7.22, wps=285383, ups=0.66, wpb=432332, bsz=16809, num_updates=9100, lr=0.000662994, gnorm=0.276, clip=0, loss_scale=4, train_wall=125, gb_free=19.9, wall=9007 epoch 006: 769 / 1689 loss=4.435, nll_loss=2.841, ppl=7.16, wps=456665, ups=1.05, wpb=433222, bsz=16414.8, num_updates=9200, lr=0.00065938, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=9102 epoch 006: 769 / 1689 loss=4.435, nll_loss=2.841, ppl=7.16, wps=456665, ups=1.05, wpb=433222, bsz=16414.8, num_updates=9200, lr=0.00065938, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=9102 epoch 006: 769 / 1689 loss=4.435, nll_loss=2.841, ppl=7.16, wps=456665, ups=1.05, wpb=433222, bsz=16414.8, num_updates=9200, lr=0.00065938, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=9102 epoch 006: 769 / 1689 loss=4.435, nll_loss=2.841, ppl=7.16, wps=456665, ups=1.05, wpb=433222, bsz=16414.8, num_updates=9200, lr=0.00065938, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=9102 epoch 006: 769 / 1689 loss=4.435, nll_loss=2.841, ppl=7.16, wps=456665, ups=1.05, wpb=433222, bsz=16414.8, num_updates=9200, lr=0.00065938, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=9102 epoch 006: 769 / 1689 loss=4.435, nll_loss=2.841, ppl=7.16, wps=456665, ups=1.05, wpb=433222, bsz=16414.8, num_updates=9200, lr=0.00065938, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=9102 epoch 006: 869 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=455674, ups=1.05, wpb=433930, bsz=16856.6, num_updates=9300, lr=0.000655826, gnorm=0.283, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=9197 epoch 006: 869 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=455674, ups=1.05, wpb=433930, bsz=16856.6, num_updates=9300, lr=0.000655826, gnorm=0.283, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=9197 epoch 006: 869 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=455674, ups=1.05, wpb=433930, bsz=16856.6, num_updates=9300, lr=0.000655826, gnorm=0.283, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=9197 epoch 006: 869 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=455674, ups=1.05, wpb=433930, bsz=16856.6, num_updates=9300, lr=0.000655826, gnorm=0.283, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=9197 epoch 006: 869 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=455674, ups=1.05, wpb=433930, bsz=16856.6, num_updates=9300, lr=0.000655826, gnorm=0.283, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=9197 epoch 006: 869 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=455674, ups=1.05, wpb=433930, bsz=16856.6, num_updates=9300, lr=0.000655826, gnorm=0.283, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=9197 epoch 006: 969 / 1689 loss=4.438, nll_loss=2.844, ppl=7.18, wps=458257, ups=1.05, wpb=434430, bsz=16272.5, num_updates=9400, lr=0.000652328, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9292 epoch 006: 969 / 1689 loss=4.438, nll_loss=2.844, ppl=7.18, wps=458257, ups=1.05, wpb=434430, bsz=16272.5, num_updates=9400, lr=0.000652328, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9292 epoch 006: 969 / 1689 loss=4.438, nll_loss=2.844, ppl=7.18, wps=458257, ups=1.05, wpb=434430, bsz=16272.5, num_updates=9400, lr=0.000652328, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9292 epoch 006: 969 / 1689 loss=4.438, nll_loss=2.844, ppl=7.18, wps=458257, ups=1.05, wpb=434430, bsz=16272.5, num_updates=9400, lr=0.000652328, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9292 epoch 006: 969 / 1689 loss=4.438, nll_loss=2.844, ppl=7.18, wps=458257, ups=1.05, wpb=434430, bsz=16272.5, num_updates=9400, lr=0.000652328, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9292 epoch 006: 969 / 1689 loss=4.438, nll_loss=2.844, ppl=7.18, wps=458257, ups=1.05, wpb=434430, bsz=16272.5, num_updates=9400, lr=0.000652328, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9292 epoch 006: 1069 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=459934, ups=1.06, wpb=432970, bsz=16418.2, num_updates=9500, lr=0.000648886, gnorm=0.289, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=9386 epoch 006: 1069 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=459934, ups=1.06, wpb=432970, bsz=16418.2, num_updates=9500, lr=0.000648886, gnorm=0.289, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=9386 epoch 006: 1069 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=459934, ups=1.06, wpb=432970, bsz=16418.2, num_updates=9500, lr=0.000648886, gnorm=0.289, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=9386 epoch 006: 1069 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=459934, ups=1.06, wpb=432970, bsz=16418.2, num_updates=9500, lr=0.000648886, gnorm=0.289, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=9386 epoch 006: 1069 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=459934, ups=1.06, wpb=432970, bsz=16418.2, num_updates=9500, lr=0.000648886, gnorm=0.289, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=9386 epoch 006: 1069 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=459934, ups=1.06, wpb=432970, bsz=16418.2, num_updates=9500, lr=0.000648886, gnorm=0.289, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=9386 epoch 006: 1169 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459240, ups=1.06, wpb=434277, bsz=16734.2, num_updates=9600, lr=0.000645497, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=9480 epoch 006: 1169 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459240, ups=1.06, wpb=434277, bsz=16734.2, num_updates=9600, lr=0.000645497, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=9480 epoch 006: 1169 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459240, ups=1.06, wpb=434277, bsz=16734.2, num_updates=9600, lr=0.000645497, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=9480 epoch 006: 1169 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459240, ups=1.06, wpb=434277, bsz=16734.2, num_updates=9600, lr=0.000645497, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=9480 epoch 006: 1169 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459240, ups=1.06, wpb=434277, bsz=16734.2, num_updates=9600, lr=0.000645497, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=9480 epoch 006: 1169 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459240, ups=1.06, wpb=434277, bsz=16734.2, num_updates=9600, lr=0.000645497, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=9480 epoch 006: 1269 / 1689 loss=4.428, nll_loss=2.834, ppl=7.13, wps=459973, ups=1.06, wpb=433621, bsz=16242.5, num_updates=9700, lr=0.000642161, gnorm=0.275, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=9575 epoch 006: 1269 / 1689 loss=4.428, nll_loss=2.834, ppl=7.13, wps=459973, ups=1.06, wpb=433621, bsz=16242.5, num_updates=9700, lr=0.000642161, gnorm=0.275, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=9575 epoch 006: 1269 / 1689 loss=4.428, nll_loss=2.834, ppl=7.13, wps=459973, ups=1.06, wpb=433621, bsz=16242.5, num_updates=9700, lr=0.000642161, gnorm=0.275, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=9575 epoch 006: 1269 / 1689 loss=4.428, nll_loss=2.834, ppl=7.13, wps=459973, ups=1.06, wpb=433621, bsz=16242.5, num_updates=9700, lr=0.000642161, gnorm=0.275, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=9575 epoch 006: 1269 / 1689 loss=4.428, nll_loss=2.834, ppl=7.13, wps=459973, ups=1.06, wpb=433621, bsz=16242.5, num_updates=9700, lr=0.000642161, gnorm=0.275, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=9575 epoch 006: 1269 / 1689 loss=4.428, nll_loss=2.834, ppl=7.13, wps=459973, ups=1.06, wpb=433621, bsz=16242.5, num_updates=9700, lr=0.000642161, gnorm=0.275, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=9575 epoch 006: 1369 / 1689 loss=4.435, nll_loss=2.841, ppl=7.17, wps=458746, ups=1.06, wpb=432912, bsz=16698.2, num_updates=9800, lr=0.000638877, gnorm=0.285, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=9669 epoch 006: 1369 / 1689 loss=4.435, nll_loss=2.841, ppl=7.17, wps=458746, ups=1.06, wpb=432912, bsz=16698.2, num_updates=9800, lr=0.000638877, gnorm=0.285, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=9669 epoch 006: 1369 / 1689 loss=4.435, nll_loss=2.841, ppl=7.17, wps=458746, ups=1.06, wpb=432912, bsz=16698.2, num_updates=9800, lr=0.000638877, gnorm=0.285, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=9669 epoch 006: 1369 / 1689 loss=4.435, nll_loss=2.841, ppl=7.17, wps=458746, ups=1.06, wpb=432912, bsz=16698.2, num_updates=9800, lr=0.000638877, gnorm=0.285, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=9669 epoch 006: 1369 / 1689 loss=4.435, nll_loss=2.841, ppl=7.17, wps=458746, ups=1.06, wpb=432912, bsz=16698.2, num_updates=9800, lr=0.000638877, gnorm=0.285, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=9669 epoch 006: 1369 / 1689 loss=4.435, nll_loss=2.841, ppl=7.17, wps=458746, ups=1.06, wpb=432912, bsz=16698.2, num_updates=9800, lr=0.000638877, gnorm=0.285, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=9669 epoch 006: 1469 / 1689 loss=4.437, nll_loss=2.844, ppl=7.18, wps=458952, ups=1.05, wpb=436014, bsz=16293, num_updates=9900, lr=0.000635642, gnorm=0.269, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=9764 epoch 006: 1469 / 1689 loss=4.437, nll_loss=2.844, ppl=7.18, wps=458952, ups=1.05, wpb=436014, bsz=16293, num_updates=9900, lr=0.000635642, gnorm=0.269, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=9764 epoch 006: 1469 / 1689 loss=4.437, nll_loss=2.844, ppl=7.18, wps=458952, ups=1.05, wpb=436014, bsz=16293, num_updates=9900, lr=0.000635642, gnorm=0.269, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=9764 epoch 006: 1469 / 1689 loss=4.437, nll_loss=2.844, ppl=7.18, wps=458952, ups=1.05, wpb=436014, bsz=16293, num_updates=9900, lr=0.000635642, gnorm=0.269, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=9764 epoch 006: 1469 / 1689 loss=4.437, nll_loss=2.844, ppl=7.18, wps=458952, ups=1.05, wpb=436014, bsz=16293, num_updates=9900, lr=0.000635642, gnorm=0.269, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=9764 epoch 006: 1469 / 1689 loss=4.437, nll_loss=2.844, ppl=7.18, wps=458952, ups=1.05, wpb=436014, bsz=16293, num_updates=9900, lr=0.000635642, gnorm=0.269, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=9764 epoch 006: 1569 / 1689 loss=4.431, nll_loss=2.838, ppl=7.15, wps=462174, ups=1.06, wpb=436219, bsz=16688, num_updates=10000, lr=0.000632456, gnorm=0.269, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=9858 epoch 006: 1569 / 1689 loss=4.431, nll_loss=2.838, ppl=7.15, wps=462174, ups=1.06, wpb=436219, bsz=16688, num_updates=10000, lr=0.000632456, gnorm=0.269, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=9858 epoch 006: 1569 / 1689 loss=4.431, nll_loss=2.838, ppl=7.15, wps=462174, ups=1.06, wpb=436219, bsz=16688, num_updates=10000, lr=0.000632456, gnorm=0.269, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=9858 epoch 006: 1569 / 1689 loss=4.431, nll_loss=2.838, ppl=7.15, wps=462174, ups=1.06, wpb=436219, bsz=16688, num_updates=10000, lr=0.000632456, gnorm=0.269, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=9858 epoch 006: 1569 / 1689 loss=4.431, nll_loss=2.838, ppl=7.15, wps=462174, ups=1.06, wpb=436219, bsz=16688, num_updates=10000, lr=0.000632456, gnorm=0.269, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=9858 epoch 006: 1569 / 1689 loss=4.431, nll_loss=2.838, ppl=7.15, wps=462174, ups=1.06, wpb=436219, bsz=16688, num_updates=10000, lr=0.000632456, gnorm=0.269, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=9858 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 4.375 | nll_loss 2.739 | ppl 6.68 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.375 epoch 006 | valid on 'valid' subset | loss 4.375 | nll_loss 2.739 | ppl 6.68 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.375 epoch 006 | valid on 'valid' subset | loss 4.375 | nll_loss 2.739 | ppl 6.68 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.375 epoch 006 | valid on 'valid' subset | loss 4.375 | nll_loss 2.739 | ppl 6.68 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.375 epoch 006 | valid on 'valid' subset | loss 4.375 | nll_loss 2.739 | ppl 6.68 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.375 epoch 006 | valid on 'valid' subset | loss 4.375 | nll_loss 2.739 | ppl 6.68 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.375 epoch 006: 1669 / 1689 loss=4.412, nll_loss=2.817, ppl=7.05, wps=339544, ups=0.79, wpb=432170, bsz=16667.4, num_updates=10100, lr=0.000629317, gnorm=0.276, clip=0, loss_scale=4, train_wall=102, gb_free=19, wall=9986 epoch 006: 1669 / 1689 loss=4.412, nll_loss=2.817, ppl=7.05, wps=339544, ups=0.79, wpb=432170, bsz=16667.4, num_updates=10100, lr=0.000629317, gnorm=0.276, clip=0, loss_scale=4, train_wall=102, gb_free=19, wall=9986 epoch 006: 1669 / 1689 loss=4.412, nll_loss=2.817, ppl=7.05, wps=339544, ups=0.79, wpb=432170, bsz=16667.4, num_updates=10100, lr=0.000629317, gnorm=0.276, clip=0, loss_scale=4, train_wall=102, gb_free=19, wall=9986 epoch 006: 1669 / 1689 loss=4.412, nll_loss=2.817, ppl=7.05, wps=339544, ups=0.79, wpb=432170, bsz=16667.4, num_updates=10100, lr=0.000629317, gnorm=0.276, clip=0, loss_scale=4, train_wall=102, gb_free=19, wall=9986 epoch 006: 1669 / 1689 loss=4.412, nll_loss=2.817, ppl=7.05, wps=339544, ups=0.79, wpb=432170, bsz=16667.4, num_updates=10100, lr=0.000629317, gnorm=0.276, clip=0, loss_scale=4, train_wall=102, gb_free=19, wall=9986 epoch 006: 1669 / 1689 loss=4.412, nll_loss=2.817, ppl=7.05, wps=339544, ups=0.79, wpb=432170, bsz=16667.4, num_updates=10100, lr=0.000629317, gnorm=0.276, clip=0, loss_scale=4, train_wall=102, gb_free=19, wall=9986 end of epoch 6 (average epoch stats below) epoch 006 | loss 4.438 | nll_loss 2.844 | ppl 7.18 | wps 433628 | ups 1 | wpb 433533 | bsz 16505.7 | num_updates 10120 | lr 0.000628695 | gnorm 0.283 | clip 0 | loss_scale 4 | train_wall 1611 | gb_free 22.3 | wall 10004 epoch 006 | loss 4.438 | nll_loss 2.844 | ppl 7.18 | wps 433628 | ups 1 | wpb 433533 | bsz 16505.7 | num_updates 10120 | lr 0.000628695 | gnorm 0.283 | clip 0 | loss_scale 4 | train_wall 1611 | gb_free 22.3 | wall 10004 epoch 006 | loss 4.438 | nll_loss 2.844 | ppl 7.18 | wps 433628 | ups 1 | wpb 433533 | bsz 16505.7 | num_updates 10120 | lr 0.000628695 | gnorm 0.283 | clip 0 | loss_scale 4 | train_wall 1611 | gb_free 22.3 | wall 10004 epoch 006 | loss 4.438 | nll_loss 2.844 | ppl 7.18 | wps 433628 | ups 1 | wpb 433533 | bsz 16505.7 | num_updates 10120 | lr 0.000628695 | gnorm 0.283 | clip 0 | loss_scale 4 | train_wall 1611 | gb_free 22.3 | wall 10004 epoch 006 | loss 4.438 | nll_loss 2.844 | ppl 7.18 | wps 433628 | ups 1 | wpb 433533 | bsz 16505.7 | num_updates 10120 | lr 0.000628695 | gnorm 0.283 | clip 0 | loss_scale 4 | train_wall 1611 | gb_free 22.3 | wall 10004 epoch 006 | loss 4.438 | nll_loss 2.844 | ppl 7.18 | wps 433628 | ups 1 | wpb 433533 | bsz 16505.7 | num_updates 10120 | lr 0.000628695 | gnorm 0.283 | clip 0 | loss_scale 4 | train_wall 1611 | gb_free 22.3 | wall 10004 Start iterating over samples epoch 007: 81 / 1689 loss=4.394, nll_loss=2.795, ppl=6.94, wps=451824, ups=1.05, wpb=429298, bsz=16335.2, num_updates=10200, lr=0.000626224, gnorm=0.276, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=10081 epoch 007: 81 / 1689 loss=4.394, nll_loss=2.795, ppl=6.94, wps=451824, ups=1.05, wpb=429298, bsz=16335.2, num_updates=10200, lr=0.000626224, gnorm=0.276, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=10081 epoch 007: 81 / 1689 loss=4.394, nll_loss=2.795, ppl=6.94, wps=451824, ups=1.05, wpb=429298, bsz=16335.2, num_updates=10200, lr=0.000626224, gnorm=0.276, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=10081 epoch 007: 81 / 1689 loss=4.394, nll_loss=2.795, ppl=6.94, wps=451824, ups=1.05, wpb=429298, bsz=16335.2, num_updates=10200, lr=0.000626224, gnorm=0.276, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=10081 epoch 007: 81 / 1689 loss=4.394, nll_loss=2.795, ppl=6.94, wps=451824, ups=1.05, wpb=429298, bsz=16335.2, num_updates=10200, lr=0.000626224, gnorm=0.276, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=10081 epoch 007: 81 / 1689 loss=4.394, nll_loss=2.795, ppl=6.94, wps=451824, ups=1.05, wpb=429298, bsz=16335.2, num_updates=10200, lr=0.000626224, gnorm=0.276, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=10081 epoch 007: 81 / 1689 loss=4.394, nll_loss=2.795, ppl=6.94, wps=451824, ups=1.05, wpb=429298, bsz=16335.2, num_updates=10200, lr=0.000626224, gnorm=0.276, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=10081 epoch 007: 181 / 1689 loss=4.384, nll_loss=2.784, ppl=6.89, wps=458642, ups=1.06, wpb=433456, bsz=16522.2, num_updates=10300, lr=0.000623177, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=10175 epoch 007: 181 / 1689 loss=4.384, nll_loss=2.784, ppl=6.89, wps=458642, ups=1.06, wpb=433456, bsz=16522.2, num_updates=10300, lr=0.000623177, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=10175 epoch 007: 181 / 1689 loss=4.384, nll_loss=2.784, ppl=6.89, wps=458642, ups=1.06, wpb=433456, bsz=16522.2, num_updates=10300, lr=0.000623177, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=10175 epoch 007: 181 / 1689 loss=4.384, nll_loss=2.784, ppl=6.89, wps=458642, ups=1.06, wpb=433456, bsz=16522.2, num_updates=10300, lr=0.000623177, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=10175 epoch 007: 181 / 1689 loss=4.384, nll_loss=2.784, ppl=6.89, wps=458642, ups=1.06, wpb=433456, bsz=16522.2, num_updates=10300, lr=0.000623177, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=10175 epoch 007: 181 / 1689 loss=4.384, nll_loss=2.784, ppl=6.89, wps=458642, ups=1.06, wpb=433456, bsz=16522.2, num_updates=10300, lr=0.000623177, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=10175 epoch 007: 181 / 1689 loss=4.384, nll_loss=2.784, ppl=6.89, wps=458642, ups=1.06, wpb=433456, bsz=16522.2, num_updates=10300, lr=0.000623177, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=10175 epoch 007: 281 / 1689 loss=4.389, nll_loss=2.79, ppl=6.91, wps=460026, ups=1.06, wpb=434148, bsz=16574.6, num_updates=10400, lr=0.000620174, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=10270 epoch 007: 281 / 1689 loss=4.389, nll_loss=2.79, ppl=6.91, wps=460026, ups=1.06, wpb=434148, bsz=16574.6, num_updates=10400, lr=0.000620174, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=10270 epoch 007: 281 / 1689 loss=4.389, nll_loss=2.79, ppl=6.91, wps=460026, ups=1.06, wpb=434148, bsz=16574.6, num_updates=10400, lr=0.000620174, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=10270 epoch 007: 281 / 1689 loss=4.389, nll_loss=2.79, ppl=6.91, wps=460026, ups=1.06, wpb=434148, bsz=16574.6, num_updates=10400, lr=0.000620174, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=10270 epoch 007: 281 / 1689 loss=4.389, nll_loss=2.79, ppl=6.91, wps=460026, ups=1.06, wpb=434148, bsz=16574.6, num_updates=10400, lr=0.000620174, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=10270 epoch 007: 281 / 1689 loss=4.389, nll_loss=2.79, ppl=6.91, wps=460026, ups=1.06, wpb=434148, bsz=16574.6, num_updates=10400, lr=0.000620174, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=10270 epoch 007: 281 / 1689 loss=4.389, nll_loss=2.79, ppl=6.91, wps=460026, ups=1.06, wpb=434148, bsz=16574.6, num_updates=10400, lr=0.000620174, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=10270 epoch 007: 381 / 1689 loss=4.4, nll_loss=2.802, ppl=6.97, wps=460046, ups=1.06, wpb=434948, bsz=16078.4, num_updates=10500, lr=0.000617213, gnorm=0.254, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10364 epoch 007: 381 / 1689 loss=4.4, nll_loss=2.802, ppl=6.97, wps=460046, ups=1.06, wpb=434948, bsz=16078.4, num_updates=10500, lr=0.000617213, gnorm=0.254, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10364 epoch 007: 381 / 1689 loss=4.4, nll_loss=2.802, ppl=6.97, wps=460046, ups=1.06, wpb=434948, bsz=16078.4, num_updates=10500, lr=0.000617213, gnorm=0.254, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10364 epoch 007: 381 / 1689 loss=4.4, nll_loss=2.802, ppl=6.97, wps=460046, ups=1.06, wpb=434948, bsz=16078.4, num_updates=10500, lr=0.000617213, gnorm=0.254, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10364 epoch 007: 381 / 1689 loss=4.4, nll_loss=2.802, ppl=6.97, wps=460046, ups=1.06, wpb=434948, bsz=16078.4, num_updates=10500, lr=0.000617213, gnorm=0.254, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10364 epoch 007: 381 / 1689 loss=4.4, nll_loss=2.802, ppl=6.97, wps=460046, ups=1.06, wpb=434948, bsz=16078.4, num_updates=10500, lr=0.000617213, gnorm=0.254, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10364 epoch 007: 381 / 1689 loss=4.4, nll_loss=2.802, ppl=6.97, wps=460046, ups=1.06, wpb=434948, bsz=16078.4, num_updates=10500, lr=0.000617213, gnorm=0.254, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10364 epoch 007: 481 / 1689 loss=4.391, nll_loss=2.792, ppl=6.93, wps=456048, ups=1.05, wpb=433882, bsz=16605.8, num_updates=10600, lr=0.000614295, gnorm=0.271, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=10459 epoch 007: 481 / 1689 loss=4.391, nll_loss=2.792, ppl=6.93, wps=456048, ups=1.05, wpb=433882, bsz=16605.8, num_updates=10600, lr=0.000614295, gnorm=0.271, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=10459 epoch 007: 481 / 1689 loss=4.391, nll_loss=2.792, ppl=6.93, wps=456048, ups=1.05, wpb=433882, bsz=16605.8, num_updates=10600, lr=0.000614295, gnorm=0.271, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=10459 epoch 007: 481 / 1689 loss=4.391, nll_loss=2.792, ppl=6.93, wps=456048, ups=1.05, wpb=433882, bsz=16605.8, num_updates=10600, lr=0.000614295, gnorm=0.271, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=10459 epoch 007: 481 / 1689 loss=4.391, nll_loss=2.792, ppl=6.93, wps=456048, ups=1.05, wpb=433882, bsz=16605.8, num_updates=10600, lr=0.000614295, gnorm=0.271, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=10459 epoch 007: 481 / 1689 loss=4.391, nll_loss=2.792, ppl=6.93, wps=456048, ups=1.05, wpb=433882, bsz=16605.8, num_updates=10600, lr=0.000614295, gnorm=0.271, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=10459 epoch 007: 481 / 1689 loss=4.391, nll_loss=2.792, ppl=6.93, wps=456048, ups=1.05, wpb=433882, bsz=16605.8, num_updates=10600, lr=0.000614295, gnorm=0.271, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=10459 epoch 007: 582 / 1689 loss=4.383, nll_loss=2.782, ppl=6.88, wps=452034, ups=1.05, wpb=431552, bsz=16380.6, num_updates=10700, lr=0.000611418, gnorm=0.269, clip=0, loss_scale=4, train_wall=94, gb_free=19.7, wall=10555 epoch 007: 582 / 1689 loss=4.383, nll_loss=2.782, ppl=6.88, wps=452034, ups=1.05, wpb=431552, bsz=16380.6, num_updates=10700, lr=0.000611418, gnorm=0.269, clip=0, loss_scale=4, train_wall=94, gb_free=19.7, wall=10555 epoch 007: 582 / 1689 loss=4.383, nll_loss=2.782, ppl=6.88, wps=452034, ups=1.05, wpb=431552, bsz=16380.6, num_updates=10700, lr=0.000611418, gnorm=0.269, clip=0, loss_scale=4, train_wall=94, gb_free=19.7, wall=10555 epoch 007: 582 / 1689 loss=4.383, nll_loss=2.782, ppl=6.88, wps=452034, ups=1.05, wpb=431552, bsz=16380.6, num_updates=10700, lr=0.000611418, gnorm=0.269, clip=0, loss_scale=4, train_wall=94, gb_free=19.7, wall=10555 epoch 007: 582 / 1689 loss=4.383, nll_loss=2.782, ppl=6.88, wps=452034, ups=1.05, wpb=431552, bsz=16380.6, num_updates=10700, lr=0.000611418, gnorm=0.269, clip=0, loss_scale=4, train_wall=94, gb_free=19.7, wall=10555 epoch 007: 582 / 1689 loss=4.383, nll_loss=2.782, ppl=6.88, wps=452034, ups=1.05, wpb=431552, bsz=16380.6, num_updates=10700, lr=0.000611418, gnorm=0.269, clip=0, loss_scale=4, train_wall=94, gb_free=19.7, wall=10555 epoch 007: 582 / 1689 loss=4.383, nll_loss=2.782, ppl=6.88, wps=452034, ups=1.05, wpb=431552, bsz=16380.6, num_updates=10700, lr=0.000611418, gnorm=0.269, clip=0, loss_scale=4, train_wall=94, gb_free=19.7, wall=10555 epoch 007: 682 / 1689 loss=4.387, nll_loss=2.788, ppl=6.91, wps=454953, ups=1.05, wpb=434157, bsz=16344.6, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=10650 epoch 007: 682 / 1689 loss=4.387, nll_loss=2.788, ppl=6.91, wps=454953, ups=1.05, wpb=434157, bsz=16344.6, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=10650 epoch 007: 682 / 1689 loss=4.387, nll_loss=2.788, ppl=6.91, wps=454953, ups=1.05, wpb=434157, bsz=16344.6, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=10650 epoch 007: 682 / 1689 loss=4.387, nll_loss=2.788, ppl=6.91, wps=454953, ups=1.05, wpb=434157, bsz=16344.6, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=10650 epoch 007: 682 / 1689 loss=4.387, nll_loss=2.788, ppl=6.91, wps=454953, ups=1.05, wpb=434157, bsz=16344.6, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=10650 epoch 007: 682 / 1689 loss=4.387, nll_loss=2.788, ppl=6.91, wps=454953, ups=1.05, wpb=434157, bsz=16344.6, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=10650 epoch 007: 682 / 1689 loss=4.387, nll_loss=2.788, ppl=6.91, wps=454953, ups=1.05, wpb=434157, bsz=16344.6, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=10650 epoch 007: 782 / 1689 loss=4.401, nll_loss=2.803, ppl=6.98, wps=456689, ups=1.05, wpb=433467, bsz=16377.2, num_updates=10900, lr=0.000605783, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=10745 epoch 007: 782 / 1689 loss=4.401, nll_loss=2.803, ppl=6.98, wps=456689, ups=1.05, wpb=433467, bsz=16377.2, num_updates=10900, lr=0.000605783, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=10745 epoch 007: 782 / 1689 loss=4.401, nll_loss=2.803, ppl=6.98, wps=456689, ups=1.05, wpb=433467, bsz=16377.2, num_updates=10900, lr=0.000605783, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=10745 epoch 007: 782 / 1689 loss=4.401, nll_loss=2.803, ppl=6.98, wps=456689, ups=1.05, wpb=433467, bsz=16377.2, num_updates=10900, lr=0.000605783, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=10745 epoch 007: 782 / 1689 loss=4.401, nll_loss=2.803, ppl=6.98, wps=456689, ups=1.05, wpb=433467, bsz=16377.2, num_updates=10900, lr=0.000605783, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=10745 epoch 007: 782 / 1689 loss=4.401, nll_loss=2.803, ppl=6.98, wps=456689, ups=1.05, wpb=433467, bsz=16377.2, num_updates=10900, lr=0.000605783, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=10745 epoch 007: 782 / 1689 loss=4.401, nll_loss=2.803, ppl=6.98, wps=456689, ups=1.05, wpb=433467, bsz=16377.2, num_updates=10900, lr=0.000605783, gnorm=0.263, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=10745 epoch 007: 883 / 1689 loss=4.401, nll_loss=2.804, ppl=6.98, wps=454409, ups=1.05, wpb=432994, bsz=16432.8, num_updates=11000, lr=0.000603023, gnorm=0.266, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10840 epoch 007: 883 / 1689 loss=4.401, nll_loss=2.804, ppl=6.98, wps=454409, ups=1.05, wpb=432994, bsz=16432.8, num_updates=11000, lr=0.000603023, gnorm=0.266, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10840 epoch 007: 883 / 1689 loss=4.401, nll_loss=2.804, ppl=6.98, wps=454409, ups=1.05, wpb=432994, bsz=16432.8, num_updates=11000, lr=0.000603023, gnorm=0.266, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10840 epoch 007: 883 / 1689 loss=4.401, nll_loss=2.804, ppl=6.98, wps=454409, ups=1.05, wpb=432994, bsz=16432.8, num_updates=11000, lr=0.000603023, gnorm=0.266, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10840 epoch 007: 883 / 1689 loss=4.401, nll_loss=2.804, ppl=6.98, wps=454409, ups=1.05, wpb=432994, bsz=16432.8, num_updates=11000, lr=0.000603023, gnorm=0.266, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10840 epoch 007: 883 / 1689 loss=4.401, nll_loss=2.804, ppl=6.98, wps=454409, ups=1.05, wpb=432994, bsz=16432.8, num_updates=11000, lr=0.000603023, gnorm=0.266, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10840 epoch 007: 883 / 1689 loss=4.401, nll_loss=2.804, ppl=6.98, wps=454409, ups=1.05, wpb=432994, bsz=16432.8, num_updates=11000, lr=0.000603023, gnorm=0.266, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10840 begin validation on "valid" subset epoch 007 | valid on 'valid' subset | loss 4.383 | nll_loss 2.749 | ppl 6.72 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.375 epoch 007 | valid on 'valid' subset | loss 4.383 | nll_loss 2.749 | ppl 6.72 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.375 epoch 007 | valid on 'valid' subset | loss 4.383 | nll_loss 2.749 | ppl 6.72 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.375 epoch 007 | valid on 'valid' subset | loss 4.383 | nll_loss 2.749 | ppl 6.72 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.375 epoch 007 | valid on 'valid' subset | loss 4.383 | nll_loss 2.749 | ppl 6.72 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.375 epoch 007 | valid on 'valid' subset | loss 4.383 | nll_loss 2.749 | ppl 6.72 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.375 epoch 007 | valid on 'valid' subset | loss 4.383 | nll_loss 2.749 | ppl 6.72 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.375 epoch 007: 983 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=407454, ups=0.94, wpb=434175, bsz=16333.4, num_updates=11100, lr=0.0006003, gnorm=0.281, clip=0, loss_scale=2, train_wall=91, gb_free=19.8, wall=10947 epoch 007: 983 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=407454, ups=0.94, wpb=434175, bsz=16333.4, num_updates=11100, lr=0.0006003, gnorm=0.281, clip=0, loss_scale=2, train_wall=91, gb_free=19.8, wall=10947 epoch 007: 983 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=407454, ups=0.94, wpb=434175, bsz=16333.4, num_updates=11100, lr=0.0006003, gnorm=0.281, clip=0, loss_scale=2, train_wall=91, gb_free=19.8, wall=10947 epoch 007: 983 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=407454, ups=0.94, wpb=434175, bsz=16333.4, num_updates=11100, lr=0.0006003, gnorm=0.281, clip=0, loss_scale=2, train_wall=91, gb_free=19.8, wall=10947 epoch 007: 983 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=407454, ups=0.94, wpb=434175, bsz=16333.4, num_updates=11100, lr=0.0006003, gnorm=0.281, clip=0, loss_scale=2, train_wall=91, gb_free=19.8, wall=10947 epoch 007: 983 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=407454, ups=0.94, wpb=434175, bsz=16333.4, num_updates=11100, lr=0.0006003, gnorm=0.281, clip=0, loss_scale=2, train_wall=91, gb_free=19.8, wall=10947 epoch 007: 983 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=407454, ups=0.94, wpb=434175, bsz=16333.4, num_updates=11100, lr=0.0006003, gnorm=0.281, clip=0, loss_scale=2, train_wall=91, gb_free=19.8, wall=10947 epoch 007: 1083 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=458712, ups=1.06, wpb=432284, bsz=17018.7, num_updates=11200, lr=0.000597614, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=11041 epoch 007: 1083 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=458712, ups=1.06, wpb=432284, bsz=17018.7, num_updates=11200, lr=0.000597614, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=11041 epoch 007: 1083 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=458712, ups=1.06, wpb=432284, bsz=17018.7, num_updates=11200, lr=0.000597614, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=11041 epoch 007: 1083 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=458712, ups=1.06, wpb=432284, bsz=17018.7, num_updates=11200, lr=0.000597614, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=11041 epoch 007: 1083 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=458712, ups=1.06, wpb=432284, bsz=17018.7, num_updates=11200, lr=0.000597614, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=11041 epoch 007: 1083 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=458712, ups=1.06, wpb=432284, bsz=17018.7, num_updates=11200, lr=0.000597614, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=11041 epoch 007: 1083 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=458712, ups=1.06, wpb=432284, bsz=17018.7, num_updates=11200, lr=0.000597614, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=11041 epoch 007: 1183 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=461731, ups=1.06, wpb=435278, bsz=16659.9, num_updates=11300, lr=0.000594964, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=11135 epoch 007: 1183 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=461731, ups=1.06, wpb=435278, bsz=16659.9, num_updates=11300, lr=0.000594964, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=11135 epoch 007: 1183 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=461731, ups=1.06, wpb=435278, bsz=16659.9, num_updates=11300, lr=0.000594964, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=11135 epoch 007: 1183 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=461731, ups=1.06, wpb=435278, bsz=16659.9, num_updates=11300, lr=0.000594964, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=11135 epoch 007: 1183 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=461731, ups=1.06, wpb=435278, bsz=16659.9, num_updates=11300, lr=0.000594964, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=11135 epoch 007: 1183 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=461731, ups=1.06, wpb=435278, bsz=16659.9, num_updates=11300, lr=0.000594964, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=11135 epoch 007: 1183 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=461731, ups=1.06, wpb=435278, bsz=16659.9, num_updates=11300, lr=0.000594964, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=11135 epoch 007: 1283 / 1689 loss=4.388, nll_loss=2.79, ppl=6.92, wps=458712, ups=1.05, wpb=435747, bsz=16451.8, num_updates=11400, lr=0.000592349, gnorm=0.259, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=11230 epoch 007: 1283 / 1689 loss=4.388, nll_loss=2.79, ppl=6.92, wps=458712, ups=1.05, wpb=435747, bsz=16451.8, num_updates=11400, lr=0.000592349, gnorm=0.259, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=11230 epoch 007: 1283 / 1689 loss=4.388, nll_loss=2.79, ppl=6.92, wps=458712, ups=1.05, wpb=435747, bsz=16451.8, num_updates=11400, lr=0.000592349, gnorm=0.259, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=11230 epoch 007: 1283 / 1689 loss=4.388, nll_loss=2.79, ppl=6.92, wps=458712, ups=1.05, wpb=435747, bsz=16451.8, num_updates=11400, lr=0.000592349, gnorm=0.259, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=11230 epoch 007: 1283 / 1689 loss=4.388, nll_loss=2.79, ppl=6.92, wps=458712, ups=1.05, wpb=435747, bsz=16451.8, num_updates=11400, lr=0.000592349, gnorm=0.259, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=11230 epoch 007: 1283 / 1689 loss=4.388, nll_loss=2.79, ppl=6.92, wps=458712, ups=1.05, wpb=435747, bsz=16451.8, num_updates=11400, lr=0.000592349, gnorm=0.259, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=11230 epoch 007: 1283 / 1689 loss=4.388, nll_loss=2.79, ppl=6.92, wps=458712, ups=1.05, wpb=435747, bsz=16451.8, num_updates=11400, lr=0.000592349, gnorm=0.259, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=11230 epoch 007: 1383 / 1689 loss=4.397, nll_loss=2.8, ppl=6.96, wps=460146, ups=1.06, wpb=433002, bsz=16373, num_updates=11500, lr=0.000589768, gnorm=0.252, clip=0, loss_scale=4, train_wall=93, gb_free=20.4, wall=11325 epoch 007: 1383 / 1689 loss=4.397, nll_loss=2.8, ppl=6.96, wps=460146, ups=1.06, wpb=433002, bsz=16373, num_updates=11500, lr=0.000589768, gnorm=0.252, clip=0, loss_scale=4, train_wall=93, gb_free=20.4, wall=11325 epoch 007: 1383 / 1689 loss=4.397, nll_loss=2.8, ppl=6.96, wps=460146, ups=1.06, wpb=433002, bsz=16373, num_updates=11500, lr=0.000589768, gnorm=0.252, clip=0, loss_scale=4, train_wall=93, gb_free=20.4, wall=11325 epoch 007: 1383 / 1689 loss=4.397, nll_loss=2.8, ppl=6.96, wps=460146, ups=1.06, wpb=433002, bsz=16373, num_updates=11500, lr=0.000589768, gnorm=0.252, clip=0, loss_scale=4, train_wall=93, gb_free=20.4, wall=11325 epoch 007: 1383 / 1689 loss=4.397, nll_loss=2.8, ppl=6.96, wps=460146, ups=1.06, wpb=433002, bsz=16373, num_updates=11500, lr=0.000589768, gnorm=0.252, clip=0, loss_scale=4, train_wall=93, gb_free=20.4, wall=11325 epoch 007: 1383 / 1689 loss=4.397, nll_loss=2.8, ppl=6.96, wps=460146, ups=1.06, wpb=433002, bsz=16373, num_updates=11500, lr=0.000589768, gnorm=0.252, clip=0, loss_scale=4, train_wall=93, gb_free=20.4, wall=11325 epoch 007: 1383 / 1689 loss=4.397, nll_loss=2.8, ppl=6.96, wps=460146, ups=1.06, wpb=433002, bsz=16373, num_updates=11500, lr=0.000589768, gnorm=0.252, clip=0, loss_scale=4, train_wall=93, gb_free=20.4, wall=11325 epoch 007: 1484 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=455004, ups=1.05, wpb=435171, bsz=16655.5, num_updates=11600, lr=0.00058722, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=11420 epoch 007: 1484 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=455004, ups=1.05, wpb=435171, bsz=16655.5, num_updates=11600, lr=0.00058722, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=11420 epoch 007: 1484 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=455004, ups=1.05, wpb=435171, bsz=16655.5, num_updates=11600, lr=0.00058722, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=11420 epoch 007: 1484 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=455004, ups=1.05, wpb=435171, bsz=16655.5, num_updates=11600, lr=0.00058722, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=11420 epoch 007: 1484 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=455004, ups=1.05, wpb=435171, bsz=16655.5, num_updates=11600, lr=0.00058722, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=11420 epoch 007: 1484 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=455004, ups=1.05, wpb=435171, bsz=16655.5, num_updates=11600, lr=0.00058722, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=11420 epoch 007: 1484 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=455004, ups=1.05, wpb=435171, bsz=16655.5, num_updates=11600, lr=0.00058722, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=11420 epoch 007: 1584 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=453131, ups=1.05, wpb=433572, bsz=16897.4, num_updates=11700, lr=0.000584705, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=21, wall=11516 epoch 007: 1584 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=453131, ups=1.05, wpb=433572, bsz=16897.4, num_updates=11700, lr=0.000584705, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=21, wall=11516 epoch 007: 1584 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=453131, ups=1.05, wpb=433572, bsz=16897.4, num_updates=11700, lr=0.000584705, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=21, wall=11516 epoch 007: 1584 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=453131, ups=1.05, wpb=433572, bsz=16897.4, num_updates=11700, lr=0.000584705, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=21, wall=11516 epoch 007: 1584 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=453131, ups=1.05, wpb=433572, bsz=16897.4, num_updates=11700, lr=0.000584705, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=21, wall=11516 epoch 007: 1584 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=453131, ups=1.05, wpb=433572, bsz=16897.4, num_updates=11700, lr=0.000584705, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=21, wall=11516 epoch 007: 1584 / 1689 loss=4.389, nll_loss=2.791, ppl=6.92, wps=453131, ups=1.05, wpb=433572, bsz=16897.4, num_updates=11700, lr=0.000584705, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=21, wall=11516 epoch 007: 1684 / 1689 loss=4.381, nll_loss=2.782, ppl=6.88, wps=455091, ups=1.05, wpb=432287, bsz=16444.5, num_updates=11800, lr=0.000582223, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=11611 epoch 007: 1684 / 1689 loss=4.381, nll_loss=2.782, ppl=6.88, wps=455091, ups=1.05, wpb=432287, bsz=16444.5, num_updates=11800, lr=0.000582223, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=11611 epoch 007: 1684 / 1689 loss=4.381, nll_loss=2.782, ppl=6.88, wps=455091, ups=1.05, wpb=432287, bsz=16444.5, num_updates=11800, lr=0.000582223, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=11611 epoch 007: 1684 / 1689 loss=4.381, nll_loss=2.782, ppl=6.88, wps=455091, ups=1.05, wpb=432287, bsz=16444.5, num_updates=11800, lr=0.000582223, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=11611 epoch 007: 1684 / 1689 loss=4.381, nll_loss=2.782, ppl=6.88, wps=455091, ups=1.05, wpb=432287, bsz=16444.5, num_updates=11800, lr=0.000582223, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=11611 epoch 007: 1684 / 1689 loss=4.381, nll_loss=2.782, ppl=6.88, wps=455091, ups=1.05, wpb=432287, bsz=16444.5, num_updates=11800, lr=0.000582223, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=11611 epoch 007: 1684 / 1689 loss=4.381, nll_loss=2.782, ppl=6.88, wps=455091, ups=1.05, wpb=432287, bsz=16444.5, num_updates=11800, lr=0.000582223, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=11611 end of epoch 7 (average epoch stats below) epoch 007 | loss 4.389 | nll_loss 2.791 | ppl 6.92 | wps 453465 | ups 1.05 | wpb 433507 | bsz 16504.4 | num_updates 11805 | lr 0.000582099 | gnorm 0.263 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.3 | wall 11615 epoch 007 | loss 4.389 | nll_loss 2.791 | ppl 6.92 | wps 453465 | ups 1.05 | wpb 433507 | bsz 16504.4 | num_updates 11805 | lr 0.000582099 | gnorm 0.263 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.3 | wall 11615 epoch 007 | loss 4.389 | nll_loss 2.791 | ppl 6.92 | wps 453465 | ups 1.05 | wpb 433507 | bsz 16504.4 | num_updates 11805 | lr 0.000582099 | gnorm 0.263 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.3 | wall 11615 epoch 007 | loss 4.389 | nll_loss 2.791 | ppl 6.92 | wps 453465 | ups 1.05 | wpb 433507 | bsz 16504.4 | num_updates 11805 | lr 0.000582099 | gnorm 0.263 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.3 | wall 11615 epoch 007 | loss 4.389 | nll_loss 2.791 | ppl 6.92 | wps 453465 | ups 1.05 | wpb 433507 | bsz 16504.4 | num_updates 11805 | lr 0.000582099 | gnorm 0.263 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.3 | wall 11615 epoch 007 | loss 4.389 | nll_loss 2.791 | ppl 6.92 | wps 453465 | ups 1.05 | wpb 433507 | bsz 16504.4 | num_updates 11805 | lr 0.000582099 | gnorm 0.263 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.3 | wall 11615 epoch 007 | loss 4.389 | nll_loss 2.791 | ppl 6.92 | wps 453465 | ups 1.05 | wpb 433507 | bsz 16504.4 | num_updates 11805 | lr 0.000582099 | gnorm 0.263 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.3 | wall 11615 Start iterating over samples epoch 008: 95 / 1689 loss=4.34, nll_loss=2.735, ppl=6.66, wps=450744, ups=1.05, wpb=430631, bsz=16619.6, num_updates=11900, lr=0.000579771, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=11706 epoch 008: 95 / 1689 loss=4.34, nll_loss=2.735, ppl=6.66, wps=450744, ups=1.05, wpb=430631, bsz=16619.6, num_updates=11900, lr=0.000579771, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=11706 epoch 008: 95 / 1689 loss=4.34, nll_loss=2.735, ppl=6.66, wps=450744, ups=1.05, wpb=430631, bsz=16619.6, num_updates=11900, lr=0.000579771, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=11706 epoch 008: 95 / 1689 loss=4.34, nll_loss=2.735, ppl=6.66, wps=450744, ups=1.05, wpb=430631, bsz=16619.6, num_updates=11900, lr=0.000579771, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=11706 epoch 008: 95 / 1689 loss=4.34, nll_loss=2.735, ppl=6.66, wps=450744, ups=1.05, wpb=430631, bsz=16619.6, num_updates=11900, lr=0.000579771, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=11706 epoch 008: 95 / 1689 loss=4.34, nll_loss=2.735, ppl=6.66, wps=450744, ups=1.05, wpb=430631, bsz=16619.6, num_updates=11900, lr=0.000579771, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=11706 epoch 008: 95 / 1689 loss=4.34, nll_loss=2.735, ppl=6.66, wps=450744, ups=1.05, wpb=430631, bsz=16619.6, num_updates=11900, lr=0.000579771, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=11706 epoch 008: 95 / 1689 loss=4.34, nll_loss=2.735, ppl=6.66, wps=450744, ups=1.05, wpb=430631, bsz=16619.6, num_updates=11900, lr=0.000579771, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=11706 epoch 008: 195 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=465016, ups=1.07, wpb=434072, bsz=16554.1, num_updates=12000, lr=0.00057735, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=11800 epoch 008: 195 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=465016, ups=1.07, wpb=434072, bsz=16554.1, num_updates=12000, lr=0.00057735, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=11800 epoch 008: 195 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=465016, ups=1.07, wpb=434072, bsz=16554.1, num_updates=12000, lr=0.00057735, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=11800 epoch 008: 195 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=465016, ups=1.07, wpb=434072, bsz=16554.1, num_updates=12000, lr=0.00057735, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=11800 epoch 008: 195 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=465016, ups=1.07, wpb=434072, bsz=16554.1, num_updates=12000, lr=0.00057735, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=11800 epoch 008: 195 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=465016, ups=1.07, wpb=434072, bsz=16554.1, num_updates=12000, lr=0.00057735, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=11800 epoch 008: 195 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=465016, ups=1.07, wpb=434072, bsz=16554.1, num_updates=12000, lr=0.00057735, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=11800 epoch 008: 195 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=465016, ups=1.07, wpb=434072, bsz=16554.1, num_updates=12000, lr=0.00057735, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=11800 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 4.354 | nll_loss 2.713 | ppl 6.56 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.354 epoch 008 | valid on 'valid' subset | loss 4.354 | nll_loss 2.713 | ppl 6.56 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.354 epoch 008 | valid on 'valid' subset | loss 4.354 | nll_loss 2.713 | ppl 6.56 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.354 epoch 008 | valid on 'valid' subset | loss 4.354 | nll_loss 2.713 | ppl 6.56 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.354 epoch 008 | valid on 'valid' subset | loss 4.354 | nll_loss 2.713 | ppl 6.56 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.354 epoch 008 | valid on 'valid' subset | loss 4.354 | nll_loss 2.713 | ppl 6.56 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.354 epoch 008 | valid on 'valid' subset | loss 4.354 | nll_loss 2.713 | ppl 6.56 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.354 epoch 008 | valid on 'valid' subset | loss 4.354 | nll_loss 2.713 | ppl 6.56 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.354 epoch 008: 295 / 1689 loss=4.357, nll_loss=2.754, ppl=6.75, wps=388230, ups=0.89, wpb=434704, bsz=16598.6, num_updates=12100, lr=0.00057496, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.3, wall=11912 epoch 008: 295 / 1689 loss=4.357, nll_loss=2.754, ppl=6.75, wps=388230, ups=0.89, wpb=434704, bsz=16598.6, num_updates=12100, lr=0.00057496, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.3, wall=11912 epoch 008: 295 / 1689 loss=4.357, nll_loss=2.754, ppl=6.75, wps=388230, ups=0.89, wpb=434704, bsz=16598.6, num_updates=12100, lr=0.00057496, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.3, wall=11912 epoch 008: 295 / 1689 loss=4.357, nll_loss=2.754, ppl=6.75, wps=388230, ups=0.89, wpb=434704, bsz=16598.6, num_updates=12100, lr=0.00057496, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.3, wall=11912 epoch 008: 295 / 1689 loss=4.357, nll_loss=2.754, ppl=6.75, wps=388230, ups=0.89, wpb=434704, bsz=16598.6, num_updates=12100, lr=0.00057496, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.3, wall=11912 epoch 008: 295 / 1689 loss=4.357, nll_loss=2.754, ppl=6.75, wps=388230, ups=0.89, wpb=434704, bsz=16598.6, num_updates=12100, lr=0.00057496, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.3, wall=11912 epoch 008: 295 / 1689 loss=4.357, nll_loss=2.754, ppl=6.75, wps=388230, ups=0.89, wpb=434704, bsz=16598.6, num_updates=12100, lr=0.00057496, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.3, wall=11912 epoch 008: 295 / 1689 loss=4.357, nll_loss=2.754, ppl=6.75, wps=388230, ups=0.89, wpb=434704, bsz=16598.6, num_updates=12100, lr=0.00057496, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.3, wall=11912 epoch 008: 395 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463791, ups=1.07, wpb=433657, bsz=16642.1, num_updates=12200, lr=0.000572598, gnorm=0.253, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=12005 epoch 008: 395 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463791, ups=1.07, wpb=433657, bsz=16642.1, num_updates=12200, lr=0.000572598, gnorm=0.253, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=12005 epoch 008: 395 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463791, ups=1.07, wpb=433657, bsz=16642.1, num_updates=12200, lr=0.000572598, gnorm=0.253, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=12005 epoch 008: 395 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463791, ups=1.07, wpb=433657, bsz=16642.1, num_updates=12200, lr=0.000572598, gnorm=0.253, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=12005 epoch 008: 395 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463791, ups=1.07, wpb=433657, bsz=16642.1, num_updates=12200, lr=0.000572598, gnorm=0.253, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=12005 epoch 008: 395 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463791, ups=1.07, wpb=433657, bsz=16642.1, num_updates=12200, lr=0.000572598, gnorm=0.253, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=12005 epoch 008: 395 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463791, ups=1.07, wpb=433657, bsz=16642.1, num_updates=12200, lr=0.000572598, gnorm=0.253, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=12005 epoch 008: 395 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463791, ups=1.07, wpb=433657, bsz=16642.1, num_updates=12200, lr=0.000572598, gnorm=0.253, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=12005 epoch 008: 495 / 1689 loss=4.343, nll_loss=2.739, ppl=6.67, wps=460795, ups=1.07, wpb=430976, bsz=16045.4, num_updates=12300, lr=0.000570266, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=12099 epoch 008: 495 / 1689 loss=4.343, nll_loss=2.739, ppl=6.67, wps=460795, ups=1.07, wpb=430976, bsz=16045.4, num_updates=12300, lr=0.000570266, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=12099 epoch 008: 495 / 1689 loss=4.343, nll_loss=2.739, ppl=6.67, wps=460795, ups=1.07, wpb=430976, bsz=16045.4, num_updates=12300, lr=0.000570266, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=12099 epoch 008: 495 / 1689 loss=4.343, nll_loss=2.739, ppl=6.67, wps=460795, ups=1.07, wpb=430976, bsz=16045.4, num_updates=12300, lr=0.000570266, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=12099 epoch 008: 495 / 1689 loss=4.343, nll_loss=2.739, ppl=6.67, wps=460795, ups=1.07, wpb=430976, bsz=16045.4, num_updates=12300, lr=0.000570266, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=12099 epoch 008: 495 / 1689 loss=4.343, nll_loss=2.739, ppl=6.67, wps=460795, ups=1.07, wpb=430976, bsz=16045.4, num_updates=12300, lr=0.000570266, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=12099 epoch 008: 495 / 1689 loss=4.343, nll_loss=2.739, ppl=6.67, wps=460795, ups=1.07, wpb=430976, bsz=16045.4, num_updates=12300, lr=0.000570266, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=12099 epoch 008: 495 / 1689 loss=4.343, nll_loss=2.739, ppl=6.67, wps=460795, ups=1.07, wpb=430976, bsz=16045.4, num_updates=12300, lr=0.000570266, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=12099 epoch 008: 596 / 1689 loss=4.356, nll_loss=2.754, ppl=6.75, wps=457088, ups=1.06, wpb=430264, bsz=16370.7, num_updates=12400, lr=0.000567962, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12193 epoch 008: 596 / 1689 loss=4.356, nll_loss=2.754, ppl=6.75, wps=457088, ups=1.06, wpb=430264, bsz=16370.7, num_updates=12400, lr=0.000567962, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12193 epoch 008: 596 / 1689 loss=4.356, nll_loss=2.754, ppl=6.75, wps=457088, ups=1.06, wpb=430264, bsz=16370.7, num_updates=12400, lr=0.000567962, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12193 epoch 008: 596 / 1689 loss=4.356, nll_loss=2.754, ppl=6.75, wps=457088, ups=1.06, wpb=430264, bsz=16370.7, num_updates=12400, lr=0.000567962, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12193 epoch 008: 596 / 1689 loss=4.356, nll_loss=2.754, ppl=6.75, wps=457088, ups=1.06, wpb=430264, bsz=16370.7, num_updates=12400, lr=0.000567962, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12193 epoch 008: 596 / 1689 loss=4.356, nll_loss=2.754, ppl=6.75, wps=457088, ups=1.06, wpb=430264, bsz=16370.7, num_updates=12400, lr=0.000567962, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12193 epoch 008: 596 / 1689 loss=4.356, nll_loss=2.754, ppl=6.75, wps=457088, ups=1.06, wpb=430264, bsz=16370.7, num_updates=12400, lr=0.000567962, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12193 epoch 008: 596 / 1689 loss=4.356, nll_loss=2.754, ppl=6.75, wps=457088, ups=1.06, wpb=430264, bsz=16370.7, num_updates=12400, lr=0.000567962, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12193 epoch 008: 696 / 1689 loss=4.344, nll_loss=2.741, ppl=6.68, wps=456951, ups=1.05, wpb=433254, bsz=16599, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12288 epoch 008: 696 / 1689 loss=4.344, nll_loss=2.741, ppl=6.68, wps=456951, ups=1.05, wpb=433254, bsz=16599, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12288 epoch 008: 696 / 1689 loss=4.344, nll_loss=2.741, ppl=6.68, wps=456951, ups=1.05, wpb=433254, bsz=16599, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12288 epoch 008: 696 / 1689 loss=4.344, nll_loss=2.741, ppl=6.68, wps=456951, ups=1.05, wpb=433254, bsz=16599, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12288 epoch 008: 696 / 1689 loss=4.344, nll_loss=2.741, ppl=6.68, wps=456951, ups=1.05, wpb=433254, bsz=16599, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12288 epoch 008: 696 / 1689 loss=4.344, nll_loss=2.741, ppl=6.68, wps=456951, ups=1.05, wpb=433254, bsz=16599, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12288 epoch 008: 696 / 1689 loss=4.344, nll_loss=2.741, ppl=6.68, wps=456951, ups=1.05, wpb=433254, bsz=16599, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12288 epoch 008: 696 / 1689 loss=4.344, nll_loss=2.741, ppl=6.68, wps=456951, ups=1.05, wpb=433254, bsz=16599, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12288 epoch 008: 796 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=461722, ups=1.06, wpb=434251, bsz=16372.3, num_updates=12600, lr=0.000563436, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12382 epoch 008: 796 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=461722, ups=1.06, wpb=434251, bsz=16372.3, num_updates=12600, lr=0.000563436, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12382 epoch 008: 796 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=461722, ups=1.06, wpb=434251, bsz=16372.3, num_updates=12600, lr=0.000563436, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12382 epoch 008: 796 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=461722, ups=1.06, wpb=434251, bsz=16372.3, num_updates=12600, lr=0.000563436, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12382 epoch 008: 796 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=461722, ups=1.06, wpb=434251, bsz=16372.3, num_updates=12600, lr=0.000563436, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12382 epoch 008: 796 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=461722, ups=1.06, wpb=434251, bsz=16372.3, num_updates=12600, lr=0.000563436, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12382 epoch 008: 796 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=461722, ups=1.06, wpb=434251, bsz=16372.3, num_updates=12600, lr=0.000563436, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12382 epoch 008: 796 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=461722, ups=1.06, wpb=434251, bsz=16372.3, num_updates=12600, lr=0.000563436, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=12382 epoch 008: 896 / 1689 loss=4.367, nll_loss=2.767, ppl=6.8, wps=456641, ups=1.05, wpb=434970, bsz=16585.5, num_updates=12700, lr=0.000561214, gnorm=0.262, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=12477 epoch 008: 896 / 1689 loss=4.367, nll_loss=2.767, ppl=6.8, wps=456641, ups=1.05, wpb=434970, bsz=16585.5, num_updates=12700, lr=0.000561214, gnorm=0.262, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=12477 epoch 008: 896 / 1689 loss=4.367, nll_loss=2.767, ppl=6.8, wps=456641, ups=1.05, wpb=434970, bsz=16585.5, num_updates=12700, lr=0.000561214, gnorm=0.262, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=12477 epoch 008: 896 / 1689 loss=4.367, nll_loss=2.767, ppl=6.8, wps=456641, ups=1.05, wpb=434970, bsz=16585.5, num_updates=12700, lr=0.000561214, gnorm=0.262, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=12477 epoch 008: 896 / 1689 loss=4.367, nll_loss=2.767, ppl=6.8, wps=456641, ups=1.05, wpb=434970, bsz=16585.5, num_updates=12700, lr=0.000561214, gnorm=0.262, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=12477 epoch 008: 896 / 1689 loss=4.367, nll_loss=2.767, ppl=6.8, wps=456641, ups=1.05, wpb=434970, bsz=16585.5, num_updates=12700, lr=0.000561214, gnorm=0.262, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=12477 epoch 008: 896 / 1689 loss=4.367, nll_loss=2.767, ppl=6.8, wps=456641, ups=1.05, wpb=434970, bsz=16585.5, num_updates=12700, lr=0.000561214, gnorm=0.262, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=12477 epoch 008: 896 / 1689 loss=4.367, nll_loss=2.767, ppl=6.8, wps=456641, ups=1.05, wpb=434970, bsz=16585.5, num_updates=12700, lr=0.000561214, gnorm=0.262, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=12477 epoch 008: 996 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=458011, ups=1.05, wpb=436642, bsz=16375.6, num_updates=12800, lr=0.000559017, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=12572 epoch 008: 996 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=458011, ups=1.05, wpb=436642, bsz=16375.6, num_updates=12800, lr=0.000559017, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=12572 epoch 008: 996 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=458011, ups=1.05, wpb=436642, bsz=16375.6, num_updates=12800, lr=0.000559017, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=12572 epoch 008: 996 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=458011, ups=1.05, wpb=436642, bsz=16375.6, num_updates=12800, lr=0.000559017, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=12572 epoch 008: 996 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=458011, ups=1.05, wpb=436642, bsz=16375.6, num_updates=12800, lr=0.000559017, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=12572 epoch 008: 996 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=458011, ups=1.05, wpb=436642, bsz=16375.6, num_updates=12800, lr=0.000559017, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=12572 epoch 008: 996 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=458011, ups=1.05, wpb=436642, bsz=16375.6, num_updates=12800, lr=0.000559017, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=12572 epoch 008: 996 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=458011, ups=1.05, wpb=436642, bsz=16375.6, num_updates=12800, lr=0.000559017, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=12572 epoch 008: 1096 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=457633, ups=1.05, wpb=434215, bsz=16411.8, num_updates=12900, lr=0.000556846, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=12667 epoch 008: 1096 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=457633, ups=1.05, wpb=434215, bsz=16411.8, num_updates=12900, lr=0.000556846, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=12667 epoch 008: 1096 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=457633, ups=1.05, wpb=434215, bsz=16411.8, num_updates=12900, lr=0.000556846, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=12667 epoch 008: 1096 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=457633, ups=1.05, wpb=434215, bsz=16411.8, num_updates=12900, lr=0.000556846, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=12667 epoch 008: 1096 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=457633, ups=1.05, wpb=434215, bsz=16411.8, num_updates=12900, lr=0.000556846, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=12667 epoch 008: 1096 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=457633, ups=1.05, wpb=434215, bsz=16411.8, num_updates=12900, lr=0.000556846, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=12667 epoch 008: 1096 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=457633, ups=1.05, wpb=434215, bsz=16411.8, num_updates=12900, lr=0.000556846, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=12667 epoch 008: 1096 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=457633, ups=1.05, wpb=434215, bsz=16411.8, num_updates=12900, lr=0.000556846, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=12667 epoch 008: 1196 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=457527, ups=1.05, wpb=434089, bsz=16281, num_updates=13000, lr=0.0005547, gnorm=0.237, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12762 epoch 008: 1196 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=457527, ups=1.05, wpb=434089, bsz=16281, num_updates=13000, lr=0.0005547, gnorm=0.237, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12762 epoch 008: 1196 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=457527, ups=1.05, wpb=434089, bsz=16281, num_updates=13000, lr=0.0005547, gnorm=0.237, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12762 epoch 008: 1196 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=457527, ups=1.05, wpb=434089, bsz=16281, num_updates=13000, lr=0.0005547, gnorm=0.237, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12762 epoch 008: 1196 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=457527, ups=1.05, wpb=434089, bsz=16281, num_updates=13000, lr=0.0005547, gnorm=0.237, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12762 epoch 008: 1196 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=457527, ups=1.05, wpb=434089, bsz=16281, num_updates=13000, lr=0.0005547, gnorm=0.237, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12762 epoch 008: 1196 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=457527, ups=1.05, wpb=434089, bsz=16281, num_updates=13000, lr=0.0005547, gnorm=0.237, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12762 epoch 008: 1196 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=457527, ups=1.05, wpb=434089, bsz=16281, num_updates=13000, lr=0.0005547, gnorm=0.237, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12762 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 4.341 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.341 epoch 008 | valid on 'valid' subset | loss 4.341 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.341 epoch 008 | valid on 'valid' subset | loss 4.341 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.341 epoch 008 | valid on 'valid' subset | loss 4.341 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.341 epoch 008 | valid on 'valid' subset | loss 4.341 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.341 epoch 008 | valid on 'valid' subset | loss 4.341 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.341 epoch 008 | valid on 'valid' subset | loss 4.341 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.341 epoch 008 | valid on 'valid' subset | loss 4.341 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.341 epoch 008: 1296 / 1689 loss=4.356, nll_loss=2.755, ppl=6.75, wps=380023, ups=0.87, wpb=434724, bsz=16807.8, num_updates=13100, lr=0.000552579, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12877 epoch 008: 1296 / 1689 loss=4.356, nll_loss=2.755, ppl=6.75, wps=380023, ups=0.87, wpb=434724, bsz=16807.8, num_updates=13100, lr=0.000552579, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12877 epoch 008: 1296 / 1689 loss=4.356, nll_loss=2.755, ppl=6.75, wps=380023, ups=0.87, wpb=434724, bsz=16807.8, num_updates=13100, lr=0.000552579, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12877 epoch 008: 1296 / 1689 loss=4.356, nll_loss=2.755, ppl=6.75, wps=380023, ups=0.87, wpb=434724, bsz=16807.8, num_updates=13100, lr=0.000552579, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12877 epoch 008: 1296 / 1689 loss=4.356, nll_loss=2.755, ppl=6.75, wps=380023, ups=0.87, wpb=434724, bsz=16807.8, num_updates=13100, lr=0.000552579, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12877 epoch 008: 1296 / 1689 loss=4.356, nll_loss=2.755, ppl=6.75, wps=380023, ups=0.87, wpb=434724, bsz=16807.8, num_updates=13100, lr=0.000552579, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12877 epoch 008: 1296 / 1689 loss=4.356, nll_loss=2.755, ppl=6.75, wps=380023, ups=0.87, wpb=434724, bsz=16807.8, num_updates=13100, lr=0.000552579, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12877 epoch 008: 1296 / 1689 loss=4.356, nll_loss=2.755, ppl=6.75, wps=380023, ups=0.87, wpb=434724, bsz=16807.8, num_updates=13100, lr=0.000552579, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=12877 epoch 008: 1397 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=452511, ups=1.05, wpb=432510, bsz=16711.8, num_updates=13200, lr=0.000550482, gnorm=0.249, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=12972 epoch 008: 1397 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=452511, ups=1.05, wpb=432510, bsz=16711.8, num_updates=13200, lr=0.000550482, gnorm=0.249, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=12972 epoch 008: 1397 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=452511, ups=1.05, wpb=432510, bsz=16711.8, num_updates=13200, lr=0.000550482, gnorm=0.249, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=12972 epoch 008: 1397 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=452511, ups=1.05, wpb=432510, bsz=16711.8, num_updates=13200, lr=0.000550482, gnorm=0.249, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=12972 epoch 008: 1397 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=452511, ups=1.05, wpb=432510, bsz=16711.8, num_updates=13200, lr=0.000550482, gnorm=0.249, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=12972 epoch 008: 1397 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=452511, ups=1.05, wpb=432510, bsz=16711.8, num_updates=13200, lr=0.000550482, gnorm=0.249, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=12972 epoch 008: 1397 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=452511, ups=1.05, wpb=432510, bsz=16711.8, num_updates=13200, lr=0.000550482, gnorm=0.249, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=12972 epoch 008: 1397 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=452511, ups=1.05, wpb=432510, bsz=16711.8, num_updates=13200, lr=0.000550482, gnorm=0.249, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=12972 epoch 008: 1497 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457452, ups=1.05, wpb=434190, bsz=16438.8, num_updates=13300, lr=0.000548408, gnorm=0.248, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13067 epoch 008: 1497 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457452, ups=1.05, wpb=434190, bsz=16438.8, num_updates=13300, lr=0.000548408, gnorm=0.248, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13067 epoch 008: 1497 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457452, ups=1.05, wpb=434190, bsz=16438.8, num_updates=13300, lr=0.000548408, gnorm=0.248, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13067 epoch 008: 1497 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457452, ups=1.05, wpb=434190, bsz=16438.8, num_updates=13300, lr=0.000548408, gnorm=0.248, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13067 epoch 008: 1497 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457452, ups=1.05, wpb=434190, bsz=16438.8, num_updates=13300, lr=0.000548408, gnorm=0.248, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13067 epoch 008: 1497 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457452, ups=1.05, wpb=434190, bsz=16438.8, num_updates=13300, lr=0.000548408, gnorm=0.248, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13067 epoch 008: 1497 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457452, ups=1.05, wpb=434190, bsz=16438.8, num_updates=13300, lr=0.000548408, gnorm=0.248, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13067 epoch 008: 1497 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457452, ups=1.05, wpb=434190, bsz=16438.8, num_updates=13300, lr=0.000548408, gnorm=0.248, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13067 epoch 008: 1597 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=456737, ups=1.06, wpb=431910, bsz=16620.6, num_updates=13400, lr=0.000546358, gnorm=0.257, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=13162 epoch 008: 1597 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=456737, ups=1.06, wpb=431910, bsz=16620.6, num_updates=13400, lr=0.000546358, gnorm=0.257, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=13162 epoch 008: 1597 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=456737, ups=1.06, wpb=431910, bsz=16620.6, num_updates=13400, lr=0.000546358, gnorm=0.257, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=13162 epoch 008: 1597 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=456737, ups=1.06, wpb=431910, bsz=16620.6, num_updates=13400, lr=0.000546358, gnorm=0.257, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=13162 epoch 008: 1597 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=456737, ups=1.06, wpb=431910, bsz=16620.6, num_updates=13400, lr=0.000546358, gnorm=0.257, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=13162 epoch 008: 1597 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=456737, ups=1.06, wpb=431910, bsz=16620.6, num_updates=13400, lr=0.000546358, gnorm=0.257, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=13162 epoch 008: 1597 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=456737, ups=1.06, wpb=431910, bsz=16620.6, num_updates=13400, lr=0.000546358, gnorm=0.257, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=13162 epoch 008: 1597 / 1689 loss=4.347, nll_loss=2.745, ppl=6.7, wps=456737, ups=1.06, wpb=431910, bsz=16620.6, num_updates=13400, lr=0.000546358, gnorm=0.257, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=13162 end of epoch 8 (average epoch stats below) epoch 008 | loss 4.353 | nll_loss 2.751 | ppl 6.73 | wps 447870 | ups 1.03 | wpb 433514 | bsz 16506.9 | num_updates 13492 | lr 0.000544492 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.1 | wall 13248 epoch 008 | loss 4.353 | nll_loss 2.751 | ppl 6.73 | wps 447870 | ups 1.03 | wpb 433514 | bsz 16506.9 | num_updates 13492 | lr 0.000544492 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.1 | wall 13248 epoch 008 | loss 4.353 | nll_loss 2.751 | ppl 6.73 | wps 447870 | ups 1.03 | wpb 433514 | bsz 16506.9 | num_updates 13492 | lr 0.000544492 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.1 | wall 13248 epoch 008 | loss 4.353 | nll_loss 2.751 | ppl 6.73 | wps 447870 | ups 1.03 | wpb 433514 | bsz 16506.9 | num_updates 13492 | lr 0.000544492 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.1 | wall 13248 epoch 008 | loss 4.353 | nll_loss 2.751 | ppl 6.73 | wps 447870 | ups 1.03 | wpb 433514 | bsz 16506.9 | num_updates 13492 | lr 0.000544492 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.1 | wall 13248 epoch 008 | loss 4.353 | nll_loss 2.751 | ppl 6.73 | wps 447870 | ups 1.03 | wpb 433514 | bsz 16506.9 | num_updates 13492 | lr 0.000544492 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.1 | wall 13248 epoch 008 | loss 4.353 | nll_loss 2.751 | ppl 6.73 | wps 447870 | ups 1.03 | wpb 433514 | bsz 16506.9 | num_updates 13492 | lr 0.000544492 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.1 | wall 13248 epoch 008 | loss 4.353 | nll_loss 2.751 | ppl 6.73 | wps 447870 | ups 1.03 | wpb 433514 | bsz 16506.9 | num_updates 13492 | lr 0.000544492 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.1 | wall 13248 Start iterating over samples epoch 009: 8 / 1689 loss=4.344, nll_loss=2.742, ppl=6.69, wps=457954, ups=1.06, wpb=431211, bsz=16495.1, num_updates=13500, lr=0.000544331, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13256 epoch 009: 8 / 1689 loss=4.344, nll_loss=2.742, ppl=6.69, wps=457954, ups=1.06, wpb=431211, bsz=16495.1, num_updates=13500, lr=0.000544331, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13256 epoch 009: 8 / 1689 loss=4.344, nll_loss=2.742, ppl=6.69, wps=457954, ups=1.06, wpb=431211, bsz=16495.1, num_updates=13500, lr=0.000544331, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13256 epoch 009: 8 / 1689 loss=4.344, nll_loss=2.742, ppl=6.69, wps=457954, ups=1.06, wpb=431211, bsz=16495.1, num_updates=13500, lr=0.000544331, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13256 epoch 009: 8 / 1689 loss=4.344, nll_loss=2.742, ppl=6.69, wps=457954, ups=1.06, wpb=431211, bsz=16495.1, num_updates=13500, lr=0.000544331, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13256 epoch 009: 8 / 1689 loss=4.344, nll_loss=2.742, ppl=6.69, wps=457954, ups=1.06, wpb=431211, bsz=16495.1, num_updates=13500, lr=0.000544331, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13256 epoch 009: 8 / 1689 loss=4.344, nll_loss=2.742, ppl=6.69, wps=457954, ups=1.06, wpb=431211, bsz=16495.1, num_updates=13500, lr=0.000544331, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13256 epoch 009: 8 / 1689 loss=4.344, nll_loss=2.742, ppl=6.69, wps=457954, ups=1.06, wpb=431211, bsz=16495.1, num_updates=13500, lr=0.000544331, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13256 epoch 009: 8 / 1689 loss=4.344, nll_loss=2.742, ppl=6.69, wps=457954, ups=1.06, wpb=431211, bsz=16495.1, num_updates=13500, lr=0.000544331, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13256 epoch 009: 108 / 1689 loss=4.307, nll_loss=2.698, ppl=6.49, wps=456503, ups=1.06, wpb=431511, bsz=16519.9, num_updates=13600, lr=0.000542326, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=13350 epoch 009: 108 / 1689 loss=4.307, nll_loss=2.698, ppl=6.49, wps=456503, ups=1.06, wpb=431511, bsz=16519.9, num_updates=13600, lr=0.000542326, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=13350 epoch 009: 108 / 1689 loss=4.307, nll_loss=2.698, ppl=6.49, wps=456503, ups=1.06, wpb=431511, bsz=16519.9, num_updates=13600, lr=0.000542326, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=13350 epoch 009: 108 / 1689 loss=4.307, nll_loss=2.698, ppl=6.49, wps=456503, ups=1.06, wpb=431511, bsz=16519.9, num_updates=13600, lr=0.000542326, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=13350 epoch 009: 108 / 1689 loss=4.307, nll_loss=2.698, ppl=6.49, wps=456503, ups=1.06, wpb=431511, bsz=16519.9, num_updates=13600, lr=0.000542326, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=13350 epoch 009: 108 / 1689 loss=4.307, nll_loss=2.698, ppl=6.49, wps=456503, ups=1.06, wpb=431511, bsz=16519.9, num_updates=13600, lr=0.000542326, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=13350 epoch 009: 108 / 1689 loss=4.307, nll_loss=2.698, ppl=6.49, wps=456503, ups=1.06, wpb=431511, bsz=16519.9, num_updates=13600, lr=0.000542326, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=13350 epoch 009: 108 / 1689 loss=4.307, nll_loss=2.698, ppl=6.49, wps=456503, ups=1.06, wpb=431511, bsz=16519.9, num_updates=13600, lr=0.000542326, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=13350 epoch 009: 108 / 1689 loss=4.307, nll_loss=2.698, ppl=6.49, wps=456503, ups=1.06, wpb=431511, bsz=16519.9, num_updates=13600, lr=0.000542326, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=13350 epoch 009: 208 / 1689 loss=4.321, nll_loss=2.715, ppl=6.57, wps=459477, ups=1.06, wpb=433926, bsz=16727.1, num_updates=13700, lr=0.000540343, gnorm=0.25, clip=0, loss_scale=4, train_wall=93, gb_free=17.5, wall=13445 epoch 009: 208 / 1689 loss=4.321, nll_loss=2.715, ppl=6.57, wps=459477, ups=1.06, wpb=433926, bsz=16727.1, num_updates=13700, lr=0.000540343, gnorm=0.25, clip=0, loss_scale=4, train_wall=93, gb_free=17.5, wall=13445 epoch 009: 208 / 1689 loss=4.321, nll_loss=2.715, ppl=6.57, wps=459477, ups=1.06, wpb=433926, bsz=16727.1, num_updates=13700, lr=0.000540343, gnorm=0.25, clip=0, loss_scale=4, train_wall=93, gb_free=17.5, wall=13445 epoch 009: 208 / 1689 loss=4.321, nll_loss=2.715, ppl=6.57, wps=459477, ups=1.06, wpb=433926, bsz=16727.1, num_updates=13700, lr=0.000540343, gnorm=0.25, clip=0, loss_scale=4, train_wall=93, gb_free=17.5, wall=13445 epoch 009: 208 / 1689 loss=4.321, nll_loss=2.715, ppl=6.57, wps=459477, ups=1.06, wpb=433926, bsz=16727.1, num_updates=13700, lr=0.000540343, gnorm=0.25, clip=0, loss_scale=4, train_wall=93, gb_free=17.5, wall=13445 epoch 009: 208 / 1689 loss=4.321, nll_loss=2.715, ppl=6.57, wps=459477, ups=1.06, wpb=433926, bsz=16727.1, num_updates=13700, lr=0.000540343, gnorm=0.25, clip=0, loss_scale=4, train_wall=93, gb_free=17.5, wall=13445 epoch 009: 208 / 1689 loss=4.321, nll_loss=2.715, ppl=6.57, wps=459477, ups=1.06, wpb=433926, bsz=16727.1, num_updates=13700, lr=0.000540343, gnorm=0.25, clip=0, loss_scale=4, train_wall=93, gb_free=17.5, wall=13445 epoch 009: 208 / 1689 loss=4.321, nll_loss=2.715, ppl=6.57, wps=459477, ups=1.06, wpb=433926, bsz=16727.1, num_updates=13700, lr=0.000540343, gnorm=0.25, clip=0, loss_scale=4, train_wall=93, gb_free=17.5, wall=13445 epoch 009: 208 / 1689 loss=4.321, nll_loss=2.715, ppl=6.57, wps=459477, ups=1.06, wpb=433926, bsz=16727.1, num_updates=13700, lr=0.000540343, gnorm=0.25, clip=0, loss_scale=4, train_wall=93, gb_free=17.5, wall=13445 epoch 009: 308 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=458756, ups=1.05, wpb=435021, bsz=16413.4, num_updates=13800, lr=0.000538382, gnorm=0.244, clip=0, loss_scale=4, train_wall=93, gb_free=15.7, wall=13540 epoch 009: 308 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=458756, ups=1.05, wpb=435021, bsz=16413.4, num_updates=13800, lr=0.000538382, gnorm=0.244, clip=0, loss_scale=4, train_wall=93, gb_free=15.7, wall=13540 epoch 009: 308 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=458756, ups=1.05, wpb=435021, bsz=16413.4, num_updates=13800, lr=0.000538382, gnorm=0.244, clip=0, loss_scale=4, train_wall=93, gb_free=15.7, wall=13540 epoch 009: 308 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=458756, ups=1.05, wpb=435021, bsz=16413.4, num_updates=13800, lr=0.000538382, gnorm=0.244, clip=0, loss_scale=4, train_wall=93, gb_free=15.7, wall=13540 epoch 009: 308 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=458756, ups=1.05, wpb=435021, bsz=16413.4, num_updates=13800, lr=0.000538382, gnorm=0.244, clip=0, loss_scale=4, train_wall=93, gb_free=15.7, wall=13540 epoch 009: 308 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=458756, ups=1.05, wpb=435021, bsz=16413.4, num_updates=13800, lr=0.000538382, gnorm=0.244, clip=0, loss_scale=4, train_wall=93, gb_free=15.7, wall=13540 epoch 009: 308 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=458756, ups=1.05, wpb=435021, bsz=16413.4, num_updates=13800, lr=0.000538382, gnorm=0.244, clip=0, loss_scale=4, train_wall=93, gb_free=15.7, wall=13540 epoch 009: 308 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=458756, ups=1.05, wpb=435021, bsz=16413.4, num_updates=13800, lr=0.000538382, gnorm=0.244, clip=0, loss_scale=4, train_wall=93, gb_free=15.7, wall=13540 epoch 009: 308 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=458756, ups=1.05, wpb=435021, bsz=16413.4, num_updates=13800, lr=0.000538382, gnorm=0.244, clip=0, loss_scale=4, train_wall=93, gb_free=15.7, wall=13540 epoch 009: 408 / 1689 loss=4.318, nll_loss=2.712, ppl=6.55, wps=459598, ups=1.06, wpb=435031, bsz=16444.2, num_updates=13900, lr=0.000536442, gnorm=0.241, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=13634 epoch 009: 408 / 1689 loss=4.318, nll_loss=2.712, ppl=6.55, wps=459598, ups=1.06, wpb=435031, bsz=16444.2, num_updates=13900, lr=0.000536442, gnorm=0.241, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=13634 epoch 009: 408 / 1689 loss=4.318, nll_loss=2.712, ppl=6.55, wps=459598, ups=1.06, wpb=435031, bsz=16444.2, num_updates=13900, lr=0.000536442, gnorm=0.241, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=13634 epoch 009: 408 / 1689 loss=4.318, nll_loss=2.712, ppl=6.55, wps=459598, ups=1.06, wpb=435031, bsz=16444.2, num_updates=13900, lr=0.000536442, gnorm=0.241, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=13634 epoch 009: 408 / 1689 loss=4.318, nll_loss=2.712, ppl=6.55, wps=459598, ups=1.06, wpb=435031, bsz=16444.2, num_updates=13900, lr=0.000536442, gnorm=0.241, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=13634 epoch 009: 408 / 1689 loss=4.318, nll_loss=2.712, ppl=6.55, wps=459598, ups=1.06, wpb=435031, bsz=16444.2, num_updates=13900, lr=0.000536442, gnorm=0.241, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=13634 epoch 009: 408 / 1689 loss=4.318, nll_loss=2.712, ppl=6.55, wps=459598, ups=1.06, wpb=435031, bsz=16444.2, num_updates=13900, lr=0.000536442, gnorm=0.241, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=13634 epoch 009: 408 / 1689 loss=4.318, nll_loss=2.712, ppl=6.55, wps=459598, ups=1.06, wpb=435031, bsz=16444.2, num_updates=13900, lr=0.000536442, gnorm=0.241, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=13634 epoch 009: 408 / 1689 loss=4.318, nll_loss=2.712, ppl=6.55, wps=459598, ups=1.06, wpb=435031, bsz=16444.2, num_updates=13900, lr=0.000536442, gnorm=0.241, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=13634 epoch 009: 509 / 1689 loss=4.324, nll_loss=2.719, ppl=6.59, wps=450769, ups=1.04, wpb=432730, bsz=16541.3, num_updates=14000, lr=0.000534522, gnorm=0.252, clip=0, loss_scale=2, train_wall=95, gb_free=18.7, wall=13730 epoch 009: 509 / 1689 loss=4.324, nll_loss=2.719, ppl=6.59, wps=450769, ups=1.04, wpb=432730, bsz=16541.3, num_updates=14000, lr=0.000534522, gnorm=0.252, clip=0, loss_scale=2, train_wall=95, gb_free=18.7, wall=13730 epoch 009: 509 / 1689 loss=4.324, nll_loss=2.719, ppl=6.59, wps=450769, ups=1.04, wpb=432730, bsz=16541.3, num_updates=14000, lr=0.000534522, gnorm=0.252, clip=0, loss_scale=2, train_wall=95, gb_free=18.7, wall=13730 epoch 009: 509 / 1689 loss=4.324, nll_loss=2.719, ppl=6.59, wps=450769, ups=1.04, wpb=432730, bsz=16541.3, num_updates=14000, lr=0.000534522, gnorm=0.252, clip=0, loss_scale=2, train_wall=95, gb_free=18.7, wall=13730 epoch 009: 509 / 1689 loss=4.324, nll_loss=2.719, ppl=6.59, wps=450769, ups=1.04, wpb=432730, bsz=16541.3, num_updates=14000, lr=0.000534522, gnorm=0.252, clip=0, loss_scale=2, train_wall=95, gb_free=18.7, wall=13730 epoch 009: 509 / 1689 loss=4.324, nll_loss=2.719, ppl=6.59, wps=450769, ups=1.04, wpb=432730, bsz=16541.3, num_updates=14000, lr=0.000534522, gnorm=0.252, clip=0, loss_scale=2, train_wall=95, gb_free=18.7, wall=13730 epoch 009: 509 / 1689 loss=4.324, nll_loss=2.719, ppl=6.59, wps=450769, ups=1.04, wpb=432730, bsz=16541.3, num_updates=14000, lr=0.000534522, gnorm=0.252, clip=0, loss_scale=2, train_wall=95, gb_free=18.7, wall=13730 epoch 009: 509 / 1689 loss=4.324, nll_loss=2.719, ppl=6.59, wps=450769, ups=1.04, wpb=432730, bsz=16541.3, num_updates=14000, lr=0.000534522, gnorm=0.252, clip=0, loss_scale=2, train_wall=95, gb_free=18.7, wall=13730 epoch 009: 509 / 1689 loss=4.324, nll_loss=2.719, ppl=6.59, wps=450769, ups=1.04, wpb=432730, bsz=16541.3, num_updates=14000, lr=0.000534522, gnorm=0.252, clip=0, loss_scale=2, train_wall=95, gb_free=18.7, wall=13730 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 4.336 | nll_loss 2.704 | ppl 6.51 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.336 epoch 009 | valid on 'valid' subset | loss 4.336 | nll_loss 2.704 | ppl 6.51 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.336 epoch 009 | valid on 'valid' subset | loss 4.336 | nll_loss 2.704 | ppl 6.51 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.336 epoch 009 | valid on 'valid' subset | loss 4.336 | nll_loss 2.704 | ppl 6.51 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.336 epoch 009 | valid on 'valid' subset | loss 4.336 | nll_loss 2.704 | ppl 6.51 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.336 epoch 009 | valid on 'valid' subset | loss 4.336 | nll_loss 2.704 | ppl 6.51 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.336 epoch 009 | valid on 'valid' subset | loss 4.336 | nll_loss 2.704 | ppl 6.51 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.336 epoch 009 | valid on 'valid' subset | loss 4.336 | nll_loss 2.704 | ppl 6.51 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.336 epoch 009 | valid on 'valid' subset | loss 4.336 | nll_loss 2.704 | ppl 6.51 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.336 epoch 009: 609 / 1689 loss=4.33, nll_loss=2.726, ppl=6.62, wps=355944, ups=0.82, wpb=433733, bsz=16229.4, num_updates=14100, lr=0.000532624, gnorm=0.25, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=13852 epoch 009: 609 / 1689 loss=4.33, nll_loss=2.726, ppl=6.62, wps=355944, ups=0.82, wpb=433733, bsz=16229.4, num_updates=14100, lr=0.000532624, gnorm=0.25, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=13852 epoch 009: 609 / 1689 loss=4.33, nll_loss=2.726, ppl=6.62, wps=355944, ups=0.82, wpb=433733, bsz=16229.4, num_updates=14100, lr=0.000532624, gnorm=0.25, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=13852 epoch 009: 609 / 1689 loss=4.33, nll_loss=2.726, ppl=6.62, wps=355944, ups=0.82, wpb=433733, bsz=16229.4, num_updates=14100, lr=0.000532624, gnorm=0.25, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=13852 epoch 009: 609 / 1689 loss=4.33, nll_loss=2.726, ppl=6.62, wps=355944, ups=0.82, wpb=433733, bsz=16229.4, num_updates=14100, lr=0.000532624, gnorm=0.25, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=13852 epoch 009: 609 / 1689 loss=4.33, nll_loss=2.726, ppl=6.62, wps=355944, ups=0.82, wpb=433733, bsz=16229.4, num_updates=14100, lr=0.000532624, gnorm=0.25, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=13852 epoch 009: 609 / 1689 loss=4.33, nll_loss=2.726, ppl=6.62, wps=355944, ups=0.82, wpb=433733, bsz=16229.4, num_updates=14100, lr=0.000532624, gnorm=0.25, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=13852 epoch 009: 609 / 1689 loss=4.33, nll_loss=2.726, ppl=6.62, wps=355944, ups=0.82, wpb=433733, bsz=16229.4, num_updates=14100, lr=0.000532624, gnorm=0.25, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=13852 epoch 009: 609 / 1689 loss=4.33, nll_loss=2.726, ppl=6.62, wps=355944, ups=0.82, wpb=433733, bsz=16229.4, num_updates=14100, lr=0.000532624, gnorm=0.25, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=13852 epoch 009: 709 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462286, ups=1.06, wpb=435588, bsz=16483, num_updates=14200, lr=0.000530745, gnorm=0.249, clip=1, loss_scale=2, train_wall=93, gb_free=18.9, wall=13946 epoch 009: 709 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462286, ups=1.06, wpb=435588, bsz=16483, num_updates=14200, lr=0.000530745, gnorm=0.249, clip=1, loss_scale=2, train_wall=93, gb_free=18.9, wall=13946 epoch 009: 709 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462286, ups=1.06, wpb=435588, bsz=16483, num_updates=14200, lr=0.000530745, gnorm=0.249, clip=1, loss_scale=2, train_wall=93, gb_free=18.9, wall=13946 epoch 009: 709 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462286, ups=1.06, wpb=435588, bsz=16483, num_updates=14200, lr=0.000530745, gnorm=0.249, clip=1, loss_scale=2, train_wall=93, gb_free=18.9, wall=13946 epoch 009: 709 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462286, ups=1.06, wpb=435588, bsz=16483, num_updates=14200, lr=0.000530745, gnorm=0.249, clip=1, loss_scale=2, train_wall=93, gb_free=18.9, wall=13946 epoch 009: 709 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462286, ups=1.06, wpb=435588, bsz=16483, num_updates=14200, lr=0.000530745, gnorm=0.249, clip=1, loss_scale=2, train_wall=93, gb_free=18.9, wall=13946 epoch 009: 709 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462286, ups=1.06, wpb=435588, bsz=16483, num_updates=14200, lr=0.000530745, gnorm=0.249, clip=1, loss_scale=2, train_wall=93, gb_free=18.9, wall=13946 epoch 009: 709 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462286, ups=1.06, wpb=435588, bsz=16483, num_updates=14200, lr=0.000530745, gnorm=0.249, clip=1, loss_scale=2, train_wall=93, gb_free=18.9, wall=13946 epoch 009: 709 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462286, ups=1.06, wpb=435588, bsz=16483, num_updates=14200, lr=0.000530745, gnorm=0.249, clip=1, loss_scale=2, train_wall=93, gb_free=18.9, wall=13946 epoch 009: 810 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457317, ups=1.06, wpb=432789, bsz=16695.9, num_updates=14300, lr=0.000528886, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14041 epoch 009: 810 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457317, ups=1.06, wpb=432789, bsz=16695.9, num_updates=14300, lr=0.000528886, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14041 epoch 009: 810 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457317, ups=1.06, wpb=432789, bsz=16695.9, num_updates=14300, lr=0.000528886, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14041 epoch 009: 810 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457317, ups=1.06, wpb=432789, bsz=16695.9, num_updates=14300, lr=0.000528886, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14041 epoch 009: 810 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457317, ups=1.06, wpb=432789, bsz=16695.9, num_updates=14300, lr=0.000528886, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14041 epoch 009: 810 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457317, ups=1.06, wpb=432789, bsz=16695.9, num_updates=14300, lr=0.000528886, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14041 epoch 009: 810 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457317, ups=1.06, wpb=432789, bsz=16695.9, num_updates=14300, lr=0.000528886, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14041 epoch 009: 810 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457317, ups=1.06, wpb=432789, bsz=16695.9, num_updates=14300, lr=0.000528886, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14041 epoch 009: 810 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457317, ups=1.06, wpb=432789, bsz=16695.9, num_updates=14300, lr=0.000528886, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14041 epoch 009: 910 / 1689 loss=4.321, nll_loss=2.716, ppl=6.57, wps=461997, ups=1.07, wpb=433743, bsz=16661.7, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=14135 epoch 009: 910 / 1689 loss=4.321, nll_loss=2.716, ppl=6.57, wps=461997, ups=1.07, wpb=433743, bsz=16661.7, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=14135 epoch 009: 910 / 1689 loss=4.321, nll_loss=2.716, ppl=6.57, wps=461997, ups=1.07, wpb=433743, bsz=16661.7, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=14135 epoch 009: 910 / 1689 loss=4.321, nll_loss=2.716, ppl=6.57, wps=461997, ups=1.07, wpb=433743, bsz=16661.7, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=14135 epoch 009: 910 / 1689 loss=4.321, nll_loss=2.716, ppl=6.57, wps=461997, ups=1.07, wpb=433743, bsz=16661.7, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=14135 epoch 009: 910 / 1689 loss=4.321, nll_loss=2.716, ppl=6.57, wps=461997, ups=1.07, wpb=433743, bsz=16661.7, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=14135 epoch 009: 910 / 1689 loss=4.321, nll_loss=2.716, ppl=6.57, wps=461997, ups=1.07, wpb=433743, bsz=16661.7, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=14135 epoch 009: 910 / 1689 loss=4.321, nll_loss=2.716, ppl=6.57, wps=461997, ups=1.07, wpb=433743, bsz=16661.7, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=14135 epoch 009: 910 / 1689 loss=4.321, nll_loss=2.716, ppl=6.57, wps=461997, ups=1.07, wpb=433743, bsz=16661.7, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=14135 epoch 009: 1010 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=456506, ups=1.05, wpb=432747, bsz=16322.3, num_updates=14500, lr=0.000525226, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=14230 epoch 009: 1010 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=456506, ups=1.05, wpb=432747, bsz=16322.3, num_updates=14500, lr=0.000525226, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=14230 epoch 009: 1010 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=456506, ups=1.05, wpb=432747, bsz=16322.3, num_updates=14500, lr=0.000525226, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=14230 epoch 009: 1010 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=456506, ups=1.05, wpb=432747, bsz=16322.3, num_updates=14500, lr=0.000525226, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=14230 epoch 009: 1010 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=456506, ups=1.05, wpb=432747, bsz=16322.3, num_updates=14500, lr=0.000525226, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=14230 epoch 009: 1010 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=456506, ups=1.05, wpb=432747, bsz=16322.3, num_updates=14500, lr=0.000525226, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=14230 epoch 009: 1010 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=456506, ups=1.05, wpb=432747, bsz=16322.3, num_updates=14500, lr=0.000525226, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=14230 epoch 009: 1010 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=456506, ups=1.05, wpb=432747, bsz=16322.3, num_updates=14500, lr=0.000525226, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=14230 epoch 009: 1010 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=456506, ups=1.05, wpb=432747, bsz=16322.3, num_updates=14500, lr=0.000525226, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=14230 epoch 009: 1110 / 1689 loss=4.33, nll_loss=2.727, ppl=6.62, wps=462246, ups=1.06, wpb=434886, bsz=16668.3, num_updates=14600, lr=0.000523424, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14324 epoch 009: 1110 / 1689 loss=4.33, nll_loss=2.727, ppl=6.62, wps=462246, ups=1.06, wpb=434886, bsz=16668.3, num_updates=14600, lr=0.000523424, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14324 epoch 009: 1110 / 1689 loss=4.33, nll_loss=2.727, ppl=6.62, wps=462246, ups=1.06, wpb=434886, bsz=16668.3, num_updates=14600, lr=0.000523424, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14324 epoch 009: 1110 / 1689 loss=4.33, nll_loss=2.727, ppl=6.62, wps=462246, ups=1.06, wpb=434886, bsz=16668.3, num_updates=14600, lr=0.000523424, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14324 epoch 009: 1110 / 1689 loss=4.33, nll_loss=2.727, ppl=6.62, wps=462246, ups=1.06, wpb=434886, bsz=16668.3, num_updates=14600, lr=0.000523424, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14324 epoch 009: 1110 / 1689 loss=4.33, nll_loss=2.727, ppl=6.62, wps=462246, ups=1.06, wpb=434886, bsz=16668.3, num_updates=14600, lr=0.000523424, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14324 epoch 009: 1110 / 1689 loss=4.33, nll_loss=2.727, ppl=6.62, wps=462246, ups=1.06, wpb=434886, bsz=16668.3, num_updates=14600, lr=0.000523424, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14324 epoch 009: 1110 / 1689 loss=4.33, nll_loss=2.727, ppl=6.62, wps=462246, ups=1.06, wpb=434886, bsz=16668.3, num_updates=14600, lr=0.000523424, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14324 epoch 009: 1110 / 1689 loss=4.33, nll_loss=2.727, ppl=6.62, wps=462246, ups=1.06, wpb=434886, bsz=16668.3, num_updates=14600, lr=0.000523424, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14324 epoch 009: 1211 / 1689 loss=4.313, nll_loss=2.707, ppl=6.53, wps=456370, ups=1.06, wpb=432514, bsz=16382.9, num_updates=14700, lr=0.000521641, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=14418 epoch 009: 1211 / 1689 loss=4.313, nll_loss=2.707, ppl=6.53, wps=456370, ups=1.06, wpb=432514, bsz=16382.9, num_updates=14700, lr=0.000521641, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=14418 epoch 009: 1211 / 1689 loss=4.313, nll_loss=2.707, ppl=6.53, wps=456370, ups=1.06, wpb=432514, bsz=16382.9, num_updates=14700, lr=0.000521641, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=14418 epoch 009: 1211 / 1689 loss=4.313, nll_loss=2.707, ppl=6.53, wps=456370, ups=1.06, wpb=432514, bsz=16382.9, num_updates=14700, lr=0.000521641, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=14418 epoch 009: 1211 / 1689 loss=4.313, nll_loss=2.707, ppl=6.53, wps=456370, ups=1.06, wpb=432514, bsz=16382.9, num_updates=14700, lr=0.000521641, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=14418 epoch 009: 1211 / 1689 loss=4.313, nll_loss=2.707, ppl=6.53, wps=456370, ups=1.06, wpb=432514, bsz=16382.9, num_updates=14700, lr=0.000521641, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=14418 epoch 009: 1211 / 1689 loss=4.313, nll_loss=2.707, ppl=6.53, wps=456370, ups=1.06, wpb=432514, bsz=16382.9, num_updates=14700, lr=0.000521641, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=14418 epoch 009: 1211 / 1689 loss=4.313, nll_loss=2.707, ppl=6.53, wps=456370, ups=1.06, wpb=432514, bsz=16382.9, num_updates=14700, lr=0.000521641, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=14418 epoch 009: 1211 / 1689 loss=4.313, nll_loss=2.707, ppl=6.53, wps=456370, ups=1.06, wpb=432514, bsz=16382.9, num_updates=14700, lr=0.000521641, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=14418 epoch 009: 1311 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=458880, ups=1.06, wpb=431807, bsz=16274.1, num_updates=14800, lr=0.000519875, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=14513 epoch 009: 1311 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=458880, ups=1.06, wpb=431807, bsz=16274.1, num_updates=14800, lr=0.000519875, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=14513 epoch 009: 1311 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=458880, ups=1.06, wpb=431807, bsz=16274.1, num_updates=14800, lr=0.000519875, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=14513 epoch 009: 1311 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=458880, ups=1.06, wpb=431807, bsz=16274.1, num_updates=14800, lr=0.000519875, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=14513 epoch 009: 1311 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=458880, ups=1.06, wpb=431807, bsz=16274.1, num_updates=14800, lr=0.000519875, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=14513 epoch 009: 1311 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=458880, ups=1.06, wpb=431807, bsz=16274.1, num_updates=14800, lr=0.000519875, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=14513 epoch 009: 1311 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=458880, ups=1.06, wpb=431807, bsz=16274.1, num_updates=14800, lr=0.000519875, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=14513 epoch 009: 1311 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=458880, ups=1.06, wpb=431807, bsz=16274.1, num_updates=14800, lr=0.000519875, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=14513 epoch 009: 1311 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=458880, ups=1.06, wpb=431807, bsz=16274.1, num_updates=14800, lr=0.000519875, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=14513 epoch 009: 1411 / 1689 loss=4.325, nll_loss=2.721, ppl=6.59, wps=458393, ups=1.06, wpb=433995, bsz=16609.2, num_updates=14900, lr=0.000518128, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=14607 epoch 009: 1411 / 1689 loss=4.325, nll_loss=2.721, ppl=6.59, wps=458393, ups=1.06, wpb=433995, bsz=16609.2, num_updates=14900, lr=0.000518128, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=14607 epoch 009: 1411 / 1689 loss=4.325, nll_loss=2.721, ppl=6.59, wps=458393, ups=1.06, wpb=433995, bsz=16609.2, num_updates=14900, lr=0.000518128, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=14607 epoch 009: 1411 / 1689 loss=4.325, nll_loss=2.721, ppl=6.59, wps=458393, ups=1.06, wpb=433995, bsz=16609.2, num_updates=14900, lr=0.000518128, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=14607 epoch 009: 1411 / 1689 loss=4.325, nll_loss=2.721, ppl=6.59, wps=458393, ups=1.06, wpb=433995, bsz=16609.2, num_updates=14900, lr=0.000518128, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=14607 epoch 009: 1411 / 1689 loss=4.325, nll_loss=2.721, ppl=6.59, wps=458393, ups=1.06, wpb=433995, bsz=16609.2, num_updates=14900, lr=0.000518128, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=14607 epoch 009: 1411 / 1689 loss=4.325, nll_loss=2.721, ppl=6.59, wps=458393, ups=1.06, wpb=433995, bsz=16609.2, num_updates=14900, lr=0.000518128, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=14607 epoch 009: 1411 / 1689 loss=4.325, nll_loss=2.721, ppl=6.59, wps=458393, ups=1.06, wpb=433995, bsz=16609.2, num_updates=14900, lr=0.000518128, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=14607 epoch 009: 1411 / 1689 loss=4.325, nll_loss=2.721, ppl=6.59, wps=458393, ups=1.06, wpb=433995, bsz=16609.2, num_updates=14900, lr=0.000518128, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=14607 epoch 009: 1511 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=455692, ups=1.05, wpb=433210, bsz=16558.2, num_updates=15000, lr=0.000516398, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=14702 epoch 009: 1511 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=455692, ups=1.05, wpb=433210, bsz=16558.2, num_updates=15000, lr=0.000516398, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=14702 epoch 009: 1511 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=455692, ups=1.05, wpb=433210, bsz=16558.2, num_updates=15000, lr=0.000516398, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=14702 epoch 009: 1511 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=455692, ups=1.05, wpb=433210, bsz=16558.2, num_updates=15000, lr=0.000516398, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=14702 epoch 009: 1511 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=455692, ups=1.05, wpb=433210, bsz=16558.2, num_updates=15000, lr=0.000516398, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=14702 epoch 009: 1511 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=455692, ups=1.05, wpb=433210, bsz=16558.2, num_updates=15000, lr=0.000516398, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=14702 epoch 009: 1511 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=455692, ups=1.05, wpb=433210, bsz=16558.2, num_updates=15000, lr=0.000516398, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=14702 epoch 009: 1511 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=455692, ups=1.05, wpb=433210, bsz=16558.2, num_updates=15000, lr=0.000516398, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=14702 epoch 009: 1511 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=455692, ups=1.05, wpb=433210, bsz=16558.2, num_updates=15000, lr=0.000516398, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=14702 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 4.307 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.307 epoch 009 | valid on 'valid' subset | loss 4.307 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.307 epoch 009 | valid on 'valid' subset | loss 4.307 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.307 epoch 009 | valid on 'valid' subset | loss 4.307 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.307 epoch 009 | valid on 'valid' subset | loss 4.307 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.307 epoch 009 | valid on 'valid' subset | loss 4.307 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.307 epoch 009 | valid on 'valid' subset | loss 4.307 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.307 epoch 009 | valid on 'valid' subset | loss 4.307 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.307 epoch 009 | valid on 'valid' subset | loss 4.307 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.307 epoch 009: 1611 / 1689 loss=4.338, nll_loss=2.736, ppl=6.66, wps=374509, ups=0.86, wpb=434922, bsz=16618.9, num_updates=15100, lr=0.000514685, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=14818 epoch 009: 1611 / 1689 loss=4.338, nll_loss=2.736, ppl=6.66, wps=374509, ups=0.86, wpb=434922, bsz=16618.9, num_updates=15100, lr=0.000514685, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=14818 epoch 009: 1611 / 1689 loss=4.338, nll_loss=2.736, ppl=6.66, wps=374509, ups=0.86, wpb=434922, bsz=16618.9, num_updates=15100, lr=0.000514685, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=14818 epoch 009: 1611 / 1689 loss=4.338, nll_loss=2.736, ppl=6.66, wps=374509, ups=0.86, wpb=434922, bsz=16618.9, num_updates=15100, lr=0.000514685, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=14818 epoch 009: 1611 / 1689 loss=4.338, nll_loss=2.736, ppl=6.66, wps=374509, ups=0.86, wpb=434922, bsz=16618.9, num_updates=15100, lr=0.000514685, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=14818 epoch 009: 1611 / 1689 loss=4.338, nll_loss=2.736, ppl=6.66, wps=374509, ups=0.86, wpb=434922, bsz=16618.9, num_updates=15100, lr=0.000514685, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=14818 epoch 009: 1611 / 1689 loss=4.338, nll_loss=2.736, ppl=6.66, wps=374509, ups=0.86, wpb=434922, bsz=16618.9, num_updates=15100, lr=0.000514685, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=14818 epoch 009: 1611 / 1689 loss=4.338, nll_loss=2.736, ppl=6.66, wps=374509, ups=0.86, wpb=434922, bsz=16618.9, num_updates=15100, lr=0.000514685, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=14818 epoch 009: 1611 / 1689 loss=4.338, nll_loss=2.736, ppl=6.66, wps=374509, ups=0.86, wpb=434922, bsz=16618.9, num_updates=15100, lr=0.000514685, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=14818 end of epoch 9 (average epoch stats below) epoch 009 | loss 4.323 | nll_loss 2.718 | ppl 6.58 | wps 444659 | ups 1.03 | wpb 433530 | bsz 16508.3 | num_updates 15178 | lr 0.000513361 | gnorm 0.242 | clip 0.1 | loss_scale 1 | train_wall 1572 | gb_free 20.2 | wall 14892 epoch 009 | loss 4.323 | nll_loss 2.718 | ppl 6.58 | wps 444659 | ups 1.03 | wpb 433530 | bsz 16508.3 | num_updates 15178 | lr 0.000513361 | gnorm 0.242 | clip 0.1 | loss_scale 1 | train_wall 1572 | gb_free 20.2 | wall 14892 epoch 009 | loss 4.323 | nll_loss 2.718 | ppl 6.58 | wps 444659 | ups 1.03 | wpb 433530 | bsz 16508.3 | num_updates 15178 | lr 0.000513361 | gnorm 0.242 | clip 0.1 | loss_scale 1 | train_wall 1572 | gb_free 20.2 | wall 14892 epoch 009 | loss 4.323 | nll_loss 2.718 | ppl 6.58 | wps 444659 | ups 1.03 | wpb 433530 | bsz 16508.3 | num_updates 15178 | lr 0.000513361 | gnorm 0.242 | clip 0.1 | loss_scale 1 | train_wall 1572 | gb_free 20.2 | wall 14892 epoch 009 | loss 4.323 | nll_loss 2.718 | ppl 6.58 | wps 444659 | ups 1.03 | wpb 433530 | bsz 16508.3 | num_updates 15178 | lr 0.000513361 | gnorm 0.242 | clip 0.1 | loss_scale 1 | train_wall 1572 | gb_free 20.2 | wall 14892 epoch 009 | loss 4.323 | nll_loss 2.718 | ppl 6.58 | wps 444659 | ups 1.03 | wpb 433530 | bsz 16508.3 | num_updates 15178 | lr 0.000513361 | gnorm 0.242 | clip 0.1 | loss_scale 1 | train_wall 1572 | gb_free 20.2 | wall 14892 epoch 009 | loss 4.323 | nll_loss 2.718 | ppl 6.58 | wps 444659 | ups 1.03 | wpb 433530 | bsz 16508.3 | num_updates 15178 | lr 0.000513361 | gnorm 0.242 | clip 0.1 | loss_scale 1 | train_wall 1572 | gb_free 20.2 | wall 14892 epoch 009 | loss 4.323 | nll_loss 2.718 | ppl 6.58 | wps 444659 | ups 1.03 | wpb 433530 | bsz 16508.3 | num_updates 15178 | lr 0.000513361 | gnorm 0.242 | clip 0.1 | loss_scale 1 | train_wall 1572 | gb_free 20.2 | wall 14892 epoch 009 | loss 4.323 | nll_loss 2.718 | ppl 6.58 | wps 444659 | ups 1.03 | wpb 433530 | bsz 16508.3 | num_updates 15178 | lr 0.000513361 | gnorm 0.242 | clip 0.1 | loss_scale 1 | train_wall 1572 | gb_free 20.2 | wall 14892 Start iterating over samples epoch 010: 22 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=455929, ups=1.06, wpb=431292, bsz=16655.2, num_updates=15200, lr=0.000512989, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=14913 epoch 010: 22 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=455929, ups=1.06, wpb=431292, bsz=16655.2, num_updates=15200, lr=0.000512989, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=14913 epoch 010: 22 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=455929, ups=1.06, wpb=431292, bsz=16655.2, num_updates=15200, lr=0.000512989, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=14913 epoch 010: 22 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=455929, ups=1.06, wpb=431292, bsz=16655.2, num_updates=15200, lr=0.000512989, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=14913 epoch 010: 22 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=455929, ups=1.06, wpb=431292, bsz=16655.2, num_updates=15200, lr=0.000512989, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=14913 epoch 010: 22 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=455929, ups=1.06, wpb=431292, bsz=16655.2, num_updates=15200, lr=0.000512989, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=14913 epoch 010: 22 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=455929, ups=1.06, wpb=431292, bsz=16655.2, num_updates=15200, lr=0.000512989, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=14913 epoch 010: 22 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=455929, ups=1.06, wpb=431292, bsz=16655.2, num_updates=15200, lr=0.000512989, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=14913 epoch 010: 22 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=455929, ups=1.06, wpb=431292, bsz=16655.2, num_updates=15200, lr=0.000512989, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=14913 epoch 010: 22 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=455929, ups=1.06, wpb=431292, bsz=16655.2, num_updates=15200, lr=0.000512989, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=14913 epoch 010: 122 / 1689 loss=4.297, nll_loss=2.688, ppl=6.44, wps=459062, ups=1.06, wpb=434455, bsz=16475.3, num_updates=15300, lr=0.00051131, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15008 epoch 010: 122 / 1689 loss=4.297, nll_loss=2.688, ppl=6.44, wps=459062, ups=1.06, wpb=434455, bsz=16475.3, num_updates=15300, lr=0.00051131, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15008 epoch 010: 122 / 1689 loss=4.297, nll_loss=2.688, ppl=6.44, wps=459062, ups=1.06, wpb=434455, bsz=16475.3, num_updates=15300, lr=0.00051131, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15008 epoch 010: 122 / 1689 loss=4.297, nll_loss=2.688, ppl=6.44, wps=459062, ups=1.06, wpb=434455, bsz=16475.3, num_updates=15300, lr=0.00051131, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15008 epoch 010: 122 / 1689 loss=4.297, nll_loss=2.688, ppl=6.44, wps=459062, ups=1.06, wpb=434455, bsz=16475.3, num_updates=15300, lr=0.00051131, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15008 epoch 010: 122 / 1689 loss=4.297, nll_loss=2.688, ppl=6.44, wps=459062, ups=1.06, wpb=434455, bsz=16475.3, num_updates=15300, lr=0.00051131, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15008 epoch 010: 122 / 1689 loss=4.297, nll_loss=2.688, ppl=6.44, wps=459062, ups=1.06, wpb=434455, bsz=16475.3, num_updates=15300, lr=0.00051131, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15008 epoch 010: 122 / 1689 loss=4.297, nll_loss=2.688, ppl=6.44, wps=459062, ups=1.06, wpb=434455, bsz=16475.3, num_updates=15300, lr=0.00051131, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15008 epoch 010: 122 / 1689 loss=4.297, nll_loss=2.688, ppl=6.44, wps=459062, ups=1.06, wpb=434455, bsz=16475.3, num_updates=15300, lr=0.00051131, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15008 epoch 010: 122 / 1689 loss=4.297, nll_loss=2.688, ppl=6.44, wps=459062, ups=1.06, wpb=434455, bsz=16475.3, num_updates=15300, lr=0.00051131, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15008 epoch 010: 222 / 1689 loss=4.288, nll_loss=2.678, ppl=6.4, wps=458231, ups=1.06, wpb=433180, bsz=16630.6, num_updates=15400, lr=0.000509647, gnorm=0.237, clip=1, loss_scale=1, train_wall=93, gb_free=14.4, wall=15102 epoch 010: 222 / 1689 loss=4.288, nll_loss=2.678, ppl=6.4, wps=458231, ups=1.06, wpb=433180, bsz=16630.6, num_updates=15400, lr=0.000509647, gnorm=0.237, clip=1, loss_scale=1, train_wall=93, gb_free=14.4, wall=15102 epoch 010: 222 / 1689 loss=4.288, nll_loss=2.678, ppl=6.4, wps=458231, ups=1.06, wpb=433180, bsz=16630.6, num_updates=15400, lr=0.000509647, gnorm=0.237, clip=1, loss_scale=1, train_wall=93, gb_free=14.4, wall=15102 epoch 010: 222 / 1689 loss=4.288, nll_loss=2.678, ppl=6.4, wps=458231, ups=1.06, wpb=433180, bsz=16630.6, num_updates=15400, lr=0.000509647, gnorm=0.237, clip=1, loss_scale=1, train_wall=93, gb_free=14.4, wall=15102 epoch 010: 222 / 1689 loss=4.288, nll_loss=2.678, ppl=6.4, wps=458231, ups=1.06, wpb=433180, bsz=16630.6, num_updates=15400, lr=0.000509647, gnorm=0.237, clip=1, loss_scale=1, train_wall=93, gb_free=14.4, wall=15102 epoch 010: 222 / 1689 loss=4.288, nll_loss=2.678, ppl=6.4, wps=458231, ups=1.06, wpb=433180, bsz=16630.6, num_updates=15400, lr=0.000509647, gnorm=0.237, clip=1, loss_scale=1, train_wall=93, gb_free=14.4, wall=15102 epoch 010: 222 / 1689 loss=4.288, nll_loss=2.678, ppl=6.4, wps=458231, ups=1.06, wpb=433180, bsz=16630.6, num_updates=15400, lr=0.000509647, gnorm=0.237, clip=1, loss_scale=1, train_wall=93, gb_free=14.4, wall=15102 epoch 010: 222 / 1689 loss=4.288, nll_loss=2.678, ppl=6.4, wps=458231, ups=1.06, wpb=433180, bsz=16630.6, num_updates=15400, lr=0.000509647, gnorm=0.237, clip=1, loss_scale=1, train_wall=93, gb_free=14.4, wall=15102 epoch 010: 222 / 1689 loss=4.288, nll_loss=2.678, ppl=6.4, wps=458231, ups=1.06, wpb=433180, bsz=16630.6, num_updates=15400, lr=0.000509647, gnorm=0.237, clip=1, loss_scale=1, train_wall=93, gb_free=14.4, wall=15102 epoch 010: 222 / 1689 loss=4.288, nll_loss=2.678, ppl=6.4, wps=458231, ups=1.06, wpb=433180, bsz=16630.6, num_updates=15400, lr=0.000509647, gnorm=0.237, clip=1, loss_scale=1, train_wall=93, gb_free=14.4, wall=15102 epoch 010: 322 / 1689 loss=4.294, nll_loss=2.686, ppl=6.43, wps=458336, ups=1.06, wpb=433834, bsz=16467.8, num_updates=15500, lr=0.000508001, gnorm=0.259, clip=1, loss_scale=1, train_wall=93, gb_free=17.5, wall=15197 epoch 010: 322 / 1689 loss=4.294, nll_loss=2.686, ppl=6.43, wps=458336, ups=1.06, wpb=433834, bsz=16467.8, num_updates=15500, lr=0.000508001, gnorm=0.259, clip=1, loss_scale=1, train_wall=93, gb_free=17.5, wall=15197 epoch 010: 322 / 1689 loss=4.294, nll_loss=2.686, ppl=6.43, wps=458336, ups=1.06, wpb=433834, bsz=16467.8, num_updates=15500, lr=0.000508001, gnorm=0.259, clip=1, loss_scale=1, train_wall=93, gb_free=17.5, wall=15197 epoch 010: 322 / 1689 loss=4.294, nll_loss=2.686, ppl=6.43, wps=458336, ups=1.06, wpb=433834, bsz=16467.8, num_updates=15500, lr=0.000508001, gnorm=0.259, clip=1, loss_scale=1, train_wall=93, gb_free=17.5, wall=15197 epoch 010: 322 / 1689 loss=4.294, nll_loss=2.686, ppl=6.43, wps=458336, ups=1.06, wpb=433834, bsz=16467.8, num_updates=15500, lr=0.000508001, gnorm=0.259, clip=1, loss_scale=1, train_wall=93, gb_free=17.5, wall=15197 epoch 010: 322 / 1689 loss=4.294, nll_loss=2.686, ppl=6.43, wps=458336, ups=1.06, wpb=433834, bsz=16467.8, num_updates=15500, lr=0.000508001, gnorm=0.259, clip=1, loss_scale=1, train_wall=93, gb_free=17.5, wall=15197 epoch 010: 322 / 1689 loss=4.294, nll_loss=2.686, ppl=6.43, wps=458336, ups=1.06, wpb=433834, bsz=16467.8, num_updates=15500, lr=0.000508001, gnorm=0.259, clip=1, loss_scale=1, train_wall=93, gb_free=17.5, wall=15197 epoch 010: 322 / 1689 loss=4.294, nll_loss=2.686, ppl=6.43, wps=458336, ups=1.06, wpb=433834, bsz=16467.8, num_updates=15500, lr=0.000508001, gnorm=0.259, clip=1, loss_scale=1, train_wall=93, gb_free=17.5, wall=15197 epoch 010: 322 / 1689 loss=4.294, nll_loss=2.686, ppl=6.43, wps=458336, ups=1.06, wpb=433834, bsz=16467.8, num_updates=15500, lr=0.000508001, gnorm=0.259, clip=1, loss_scale=1, train_wall=93, gb_free=17.5, wall=15197 epoch 010: 322 / 1689 loss=4.294, nll_loss=2.686, ppl=6.43, wps=458336, ups=1.06, wpb=433834, bsz=16467.8, num_updates=15500, lr=0.000508001, gnorm=0.259, clip=1, loss_scale=1, train_wall=93, gb_free=17.5, wall=15197 epoch 010: 422 / 1689 loss=4.294, nll_loss=2.686, ppl=6.44, wps=458448, ups=1.05, wpb=434816, bsz=16609, num_updates=15600, lr=0.00050637, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=15292 epoch 010: 422 / 1689 loss=4.294, nll_loss=2.686, ppl=6.44, wps=458448, ups=1.05, wpb=434816, bsz=16609, num_updates=15600, lr=0.00050637, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=15292 epoch 010: 422 / 1689 loss=4.294, nll_loss=2.686, ppl=6.44, wps=458448, ups=1.05, wpb=434816, bsz=16609, num_updates=15600, lr=0.00050637, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=15292 epoch 010: 422 / 1689 loss=4.294, nll_loss=2.686, ppl=6.44, wps=458448, ups=1.05, wpb=434816, bsz=16609, num_updates=15600, lr=0.00050637, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=15292 epoch 010: 422 / 1689 loss=4.294, nll_loss=2.686, ppl=6.44, wps=458448, ups=1.05, wpb=434816, bsz=16609, num_updates=15600, lr=0.00050637, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=15292 epoch 010: 422 / 1689 loss=4.294, nll_loss=2.686, ppl=6.44, wps=458448, ups=1.05, wpb=434816, bsz=16609, num_updates=15600, lr=0.00050637, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=15292 epoch 010: 422 / 1689 loss=4.294, nll_loss=2.686, ppl=6.44, wps=458448, ups=1.05, wpb=434816, bsz=16609, num_updates=15600, lr=0.00050637, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=15292 epoch 010: 422 / 1689 loss=4.294, nll_loss=2.686, ppl=6.44, wps=458448, ups=1.05, wpb=434816, bsz=16609, num_updates=15600, lr=0.00050637, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=15292 epoch 010: 422 / 1689 loss=4.294, nll_loss=2.686, ppl=6.44, wps=458448, ups=1.05, wpb=434816, bsz=16609, num_updates=15600, lr=0.00050637, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=15292 epoch 010: 422 / 1689 loss=4.294, nll_loss=2.686, ppl=6.44, wps=458448, ups=1.05, wpb=434816, bsz=16609, num_updates=15600, lr=0.00050637, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=15292 epoch 010: 522 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457201, ups=1.06, wpb=432012, bsz=16335.3, num_updates=15700, lr=0.000504754, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=15386 epoch 010: 522 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457201, ups=1.06, wpb=432012, bsz=16335.3, num_updates=15700, lr=0.000504754, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=15386 epoch 010: 522 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457201, ups=1.06, wpb=432012, bsz=16335.3, num_updates=15700, lr=0.000504754, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=15386 epoch 010: 522 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457201, ups=1.06, wpb=432012, bsz=16335.3, num_updates=15700, lr=0.000504754, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=15386 epoch 010: 522 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457201, ups=1.06, wpb=432012, bsz=16335.3, num_updates=15700, lr=0.000504754, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=15386 epoch 010: 522 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457201, ups=1.06, wpb=432012, bsz=16335.3, num_updates=15700, lr=0.000504754, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=15386 epoch 010: 522 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457201, ups=1.06, wpb=432012, bsz=16335.3, num_updates=15700, lr=0.000504754, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=15386 epoch 010: 522 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457201, ups=1.06, wpb=432012, bsz=16335.3, num_updates=15700, lr=0.000504754, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=15386 epoch 010: 522 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457201, ups=1.06, wpb=432012, bsz=16335.3, num_updates=15700, lr=0.000504754, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=15386 epoch 010: 522 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457201, ups=1.06, wpb=432012, bsz=16335.3, num_updates=15700, lr=0.000504754, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=15386 epoch 010: 622 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=453920, ups=1.05, wpb=432420, bsz=16812.8, num_updates=15800, lr=0.000503155, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=15482 epoch 010: 622 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=453920, ups=1.05, wpb=432420, bsz=16812.8, num_updates=15800, lr=0.000503155, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=15482 epoch 010: 622 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=453920, ups=1.05, wpb=432420, bsz=16812.8, num_updates=15800, lr=0.000503155, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=15482 epoch 010: 622 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=453920, ups=1.05, wpb=432420, bsz=16812.8, num_updates=15800, lr=0.000503155, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=15482 epoch 010: 622 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=453920, ups=1.05, wpb=432420, bsz=16812.8, num_updates=15800, lr=0.000503155, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=15482 epoch 010: 622 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=453920, ups=1.05, wpb=432420, bsz=16812.8, num_updates=15800, lr=0.000503155, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=15482 epoch 010: 622 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=453920, ups=1.05, wpb=432420, bsz=16812.8, num_updates=15800, lr=0.000503155, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=15482 epoch 010: 622 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=453920, ups=1.05, wpb=432420, bsz=16812.8, num_updates=15800, lr=0.000503155, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=15482 epoch 010: 622 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=453920, ups=1.05, wpb=432420, bsz=16812.8, num_updates=15800, lr=0.000503155, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=15482 epoch 010: 622 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=453920, ups=1.05, wpb=432420, bsz=16812.8, num_updates=15800, lr=0.000503155, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=15482 epoch 010: 722 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=454000, ups=1.05, wpb=433504, bsz=16486.1, num_updates=15900, lr=0.00050157, gnorm=0.229, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=15577 epoch 010: 722 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=454000, ups=1.05, wpb=433504, bsz=16486.1, num_updates=15900, lr=0.00050157, gnorm=0.229, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=15577 epoch 010: 722 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=454000, ups=1.05, wpb=433504, bsz=16486.1, num_updates=15900, lr=0.00050157, gnorm=0.229, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=15577 epoch 010: 722 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=454000, ups=1.05, wpb=433504, bsz=16486.1, num_updates=15900, lr=0.00050157, gnorm=0.229, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=15577 epoch 010: 722 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=454000, ups=1.05, wpb=433504, bsz=16486.1, num_updates=15900, lr=0.00050157, gnorm=0.229, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=15577 epoch 010: 722 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=454000, ups=1.05, wpb=433504, bsz=16486.1, num_updates=15900, lr=0.00050157, gnorm=0.229, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=15577 epoch 010: 722 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=454000, ups=1.05, wpb=433504, bsz=16486.1, num_updates=15900, lr=0.00050157, gnorm=0.229, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=15577 epoch 010: 722 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=454000, ups=1.05, wpb=433504, bsz=16486.1, num_updates=15900, lr=0.00050157, gnorm=0.229, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=15577 epoch 010: 722 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=454000, ups=1.05, wpb=433504, bsz=16486.1, num_updates=15900, lr=0.00050157, gnorm=0.229, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=15577 epoch 010: 722 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=454000, ups=1.05, wpb=433504, bsz=16486.1, num_updates=15900, lr=0.00050157, gnorm=0.229, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=15577 epoch 010: 822 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457290, ups=1.05, wpb=435164, bsz=16320.6, num_updates=16000, lr=0.0005, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=15672 epoch 010: 822 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457290, ups=1.05, wpb=435164, bsz=16320.6, num_updates=16000, lr=0.0005, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=15672 epoch 010: 822 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457290, ups=1.05, wpb=435164, bsz=16320.6, num_updates=16000, lr=0.0005, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=15672 epoch 010: 822 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457290, ups=1.05, wpb=435164, bsz=16320.6, num_updates=16000, lr=0.0005, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=15672 epoch 010: 822 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457290, ups=1.05, wpb=435164, bsz=16320.6, num_updates=16000, lr=0.0005, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=15672 epoch 010: 822 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457290, ups=1.05, wpb=435164, bsz=16320.6, num_updates=16000, lr=0.0005, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=15672 epoch 010: 822 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457290, ups=1.05, wpb=435164, bsz=16320.6, num_updates=16000, lr=0.0005, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=15672 epoch 010: 822 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457290, ups=1.05, wpb=435164, bsz=16320.6, num_updates=16000, lr=0.0005, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=15672 epoch 010: 822 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457290, ups=1.05, wpb=435164, bsz=16320.6, num_updates=16000, lr=0.0005, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=15672 epoch 010: 822 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=457290, ups=1.05, wpb=435164, bsz=16320.6, num_updates=16000, lr=0.0005, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=15672 begin validation on "valid" subset epoch 010 | valid on 'valid' subset | loss 4.311 | nll_loss 2.673 | ppl 6.38 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.307 epoch 010 | valid on 'valid' subset | loss 4.311 | nll_loss 2.673 | ppl 6.38 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.307 epoch 010 | valid on 'valid' subset | loss 4.311 | nll_loss 2.673 | ppl 6.38 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.307 epoch 010 | valid on 'valid' subset | loss 4.311 | nll_loss 2.673 | ppl 6.38 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.307 epoch 010 | valid on 'valid' subset | loss 4.311 | nll_loss 2.673 | ppl 6.38 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.307 epoch 010 | valid on 'valid' subset | loss 4.311 | nll_loss 2.673 | ppl 6.38 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.307 epoch 010 | valid on 'valid' subset | loss 4.311 | nll_loss 2.673 | ppl 6.38 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.307 epoch 010 | valid on 'valid' subset | loss 4.311 | nll_loss 2.673 | ppl 6.38 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.307 epoch 010 | valid on 'valid' subset | loss 4.311 | nll_loss 2.673 | ppl 6.38 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.307 epoch 010 | valid on 'valid' subset | loss 4.311 | nll_loss 2.673 | ppl 6.38 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.307 epoch 010: 922 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=403080, ups=0.93, wpb=435586, bsz=16808.6, num_updates=16100, lr=0.000498445, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=15780 epoch 010: 922 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=403080, ups=0.93, wpb=435586, bsz=16808.6, num_updates=16100, lr=0.000498445, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=15780 epoch 010: 922 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=403080, ups=0.93, wpb=435586, bsz=16808.6, num_updates=16100, lr=0.000498445, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=15780 epoch 010: 922 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=403080, ups=0.93, wpb=435586, bsz=16808.6, num_updates=16100, lr=0.000498445, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=15780 epoch 010: 922 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=403080, ups=0.93, wpb=435586, bsz=16808.6, num_updates=16100, lr=0.000498445, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=15780 epoch 010: 922 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=403080, ups=0.93, wpb=435586, bsz=16808.6, num_updates=16100, lr=0.000498445, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=15780 epoch 010: 922 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=403080, ups=0.93, wpb=435586, bsz=16808.6, num_updates=16100, lr=0.000498445, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=15780 epoch 010: 922 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=403080, ups=0.93, wpb=435586, bsz=16808.6, num_updates=16100, lr=0.000498445, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=15780 epoch 010: 922 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=403080, ups=0.93, wpb=435586, bsz=16808.6, num_updates=16100, lr=0.000498445, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=15780 epoch 010: 922 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=403080, ups=0.93, wpb=435586, bsz=16808.6, num_updates=16100, lr=0.000498445, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=15780 epoch 010: 1022 / 1689 loss=4.294, nll_loss=2.687, ppl=6.44, wps=456760, ups=1.05, wpb=433631, bsz=16653.5, num_updates=16200, lr=0.000496904, gnorm=0.234, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=15875 epoch 010: 1022 / 1689 loss=4.294, nll_loss=2.687, ppl=6.44, wps=456760, ups=1.05, wpb=433631, bsz=16653.5, num_updates=16200, lr=0.000496904, gnorm=0.234, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=15875 epoch 010: 1022 / 1689 loss=4.294, nll_loss=2.687, ppl=6.44, wps=456760, ups=1.05, wpb=433631, bsz=16653.5, num_updates=16200, lr=0.000496904, gnorm=0.234, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=15875 epoch 010: 1022 / 1689 loss=4.294, nll_loss=2.687, ppl=6.44, wps=456760, ups=1.05, wpb=433631, bsz=16653.5, num_updates=16200, lr=0.000496904, gnorm=0.234, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=15875 epoch 010: 1022 / 1689 loss=4.294, nll_loss=2.687, ppl=6.44, wps=456760, ups=1.05, wpb=433631, bsz=16653.5, num_updates=16200, lr=0.000496904, gnorm=0.234, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=15875 epoch 010: 1022 / 1689 loss=4.294, nll_loss=2.687, ppl=6.44, wps=456760, ups=1.05, wpb=433631, bsz=16653.5, num_updates=16200, lr=0.000496904, gnorm=0.234, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=15875 epoch 010: 1022 / 1689 loss=4.294, nll_loss=2.687, ppl=6.44, wps=456760, ups=1.05, wpb=433631, bsz=16653.5, num_updates=16200, lr=0.000496904, gnorm=0.234, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=15875 epoch 010: 1022 / 1689 loss=4.294, nll_loss=2.687, ppl=6.44, wps=456760, ups=1.05, wpb=433631, bsz=16653.5, num_updates=16200, lr=0.000496904, gnorm=0.234, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=15875 epoch 010: 1022 / 1689 loss=4.294, nll_loss=2.687, ppl=6.44, wps=456760, ups=1.05, wpb=433631, bsz=16653.5, num_updates=16200, lr=0.000496904, gnorm=0.234, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=15875 epoch 010: 1022 / 1689 loss=4.294, nll_loss=2.687, ppl=6.44, wps=456760, ups=1.05, wpb=433631, bsz=16653.5, num_updates=16200, lr=0.000496904, gnorm=0.234, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=15875 epoch 010: 1122 / 1689 loss=4.295, nll_loss=2.688, ppl=6.44, wps=458240, ups=1.06, wpb=432167, bsz=16451.7, num_updates=16300, lr=0.000495377, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=15969 epoch 010: 1122 / 1689 loss=4.295, nll_loss=2.688, ppl=6.44, wps=458240, ups=1.06, wpb=432167, bsz=16451.7, num_updates=16300, lr=0.000495377, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=15969 epoch 010: 1122 / 1689 loss=4.295, nll_loss=2.688, ppl=6.44, wps=458240, ups=1.06, wpb=432167, bsz=16451.7, num_updates=16300, lr=0.000495377, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=15969 epoch 010: 1122 / 1689 loss=4.295, nll_loss=2.688, ppl=6.44, wps=458240, ups=1.06, wpb=432167, bsz=16451.7, num_updates=16300, lr=0.000495377, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=15969 epoch 010: 1122 / 1689 loss=4.295, nll_loss=2.688, ppl=6.44, wps=458240, ups=1.06, wpb=432167, bsz=16451.7, num_updates=16300, lr=0.000495377, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=15969 epoch 010: 1122 / 1689 loss=4.295, nll_loss=2.688, ppl=6.44, wps=458240, ups=1.06, wpb=432167, bsz=16451.7, num_updates=16300, lr=0.000495377, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=15969 epoch 010: 1122 / 1689 loss=4.295, nll_loss=2.688, ppl=6.44, wps=458240, ups=1.06, wpb=432167, bsz=16451.7, num_updates=16300, lr=0.000495377, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=15969 epoch 010: 1122 / 1689 loss=4.295, nll_loss=2.688, ppl=6.44, wps=458240, ups=1.06, wpb=432167, bsz=16451.7, num_updates=16300, lr=0.000495377, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=15969 epoch 010: 1122 / 1689 loss=4.295, nll_loss=2.688, ppl=6.44, wps=458240, ups=1.06, wpb=432167, bsz=16451.7, num_updates=16300, lr=0.000495377, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=15969 epoch 010: 1122 / 1689 loss=4.295, nll_loss=2.688, ppl=6.44, wps=458240, ups=1.06, wpb=432167, bsz=16451.7, num_updates=16300, lr=0.000495377, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=15969 epoch 010: 1223 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=455119, ups=1.05, wpb=433654, bsz=16504.5, num_updates=16400, lr=0.000493865, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16065 epoch 010: 1223 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=455119, ups=1.05, wpb=433654, bsz=16504.5, num_updates=16400, lr=0.000493865, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16065 epoch 010: 1223 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=455119, ups=1.05, wpb=433654, bsz=16504.5, num_updates=16400, lr=0.000493865, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16065 epoch 010: 1223 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=455119, ups=1.05, wpb=433654, bsz=16504.5, num_updates=16400, lr=0.000493865, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16065 epoch 010: 1223 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=455119, ups=1.05, wpb=433654, bsz=16504.5, num_updates=16400, lr=0.000493865, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16065 epoch 010: 1223 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=455119, ups=1.05, wpb=433654, bsz=16504.5, num_updates=16400, lr=0.000493865, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16065 epoch 010: 1223 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=455119, ups=1.05, wpb=433654, bsz=16504.5, num_updates=16400, lr=0.000493865, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16065 epoch 010: 1223 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=455119, ups=1.05, wpb=433654, bsz=16504.5, num_updates=16400, lr=0.000493865, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16065 epoch 010: 1223 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=455119, ups=1.05, wpb=433654, bsz=16504.5, num_updates=16400, lr=0.000493865, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16065 epoch 010: 1223 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=455119, ups=1.05, wpb=433654, bsz=16504.5, num_updates=16400, lr=0.000493865, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16065 epoch 010: 1323 / 1689 loss=4.291, nll_loss=2.684, ppl=6.42, wps=456860, ups=1.05, wpb=433130, bsz=16684.6, num_updates=16500, lr=0.000492366, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=16160 epoch 010: 1323 / 1689 loss=4.291, nll_loss=2.684, ppl=6.42, wps=456860, ups=1.05, wpb=433130, bsz=16684.6, num_updates=16500, lr=0.000492366, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=16160 epoch 010: 1323 / 1689 loss=4.291, nll_loss=2.684, ppl=6.42, wps=456860, ups=1.05, wpb=433130, bsz=16684.6, num_updates=16500, lr=0.000492366, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=16160 epoch 010: 1323 / 1689 loss=4.291, nll_loss=2.684, ppl=6.42, wps=456860, ups=1.05, wpb=433130, bsz=16684.6, num_updates=16500, lr=0.000492366, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=16160 epoch 010: 1323 / 1689 loss=4.291, nll_loss=2.684, ppl=6.42, wps=456860, ups=1.05, wpb=433130, bsz=16684.6, num_updates=16500, lr=0.000492366, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=16160 epoch 010: 1323 / 1689 loss=4.291, nll_loss=2.684, ppl=6.42, wps=456860, ups=1.05, wpb=433130, bsz=16684.6, num_updates=16500, lr=0.000492366, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=16160 epoch 010: 1323 / 1689 loss=4.291, nll_loss=2.684, ppl=6.42, wps=456860, ups=1.05, wpb=433130, bsz=16684.6, num_updates=16500, lr=0.000492366, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=16160 epoch 010: 1323 / 1689 loss=4.291, nll_loss=2.684, ppl=6.42, wps=456860, ups=1.05, wpb=433130, bsz=16684.6, num_updates=16500, lr=0.000492366, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=16160 epoch 010: 1323 / 1689 loss=4.291, nll_loss=2.684, ppl=6.42, wps=456860, ups=1.05, wpb=433130, bsz=16684.6, num_updates=16500, lr=0.000492366, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=16160 epoch 010: 1323 / 1689 loss=4.291, nll_loss=2.684, ppl=6.42, wps=456860, ups=1.05, wpb=433130, bsz=16684.6, num_updates=16500, lr=0.000492366, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=16160 epoch 010: 1423 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462487, ups=1.06, wpb=434465, bsz=16531.6, num_updates=16600, lr=0.000490881, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=16254 epoch 010: 1423 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462487, ups=1.06, wpb=434465, bsz=16531.6, num_updates=16600, lr=0.000490881, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=16254 epoch 010: 1423 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462487, ups=1.06, wpb=434465, bsz=16531.6, num_updates=16600, lr=0.000490881, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=16254 epoch 010: 1423 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462487, ups=1.06, wpb=434465, bsz=16531.6, num_updates=16600, lr=0.000490881, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=16254 epoch 010: 1423 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462487, ups=1.06, wpb=434465, bsz=16531.6, num_updates=16600, lr=0.000490881, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=16254 epoch 010: 1423 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462487, ups=1.06, wpb=434465, bsz=16531.6, num_updates=16600, lr=0.000490881, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=16254 epoch 010: 1423 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462487, ups=1.06, wpb=434465, bsz=16531.6, num_updates=16600, lr=0.000490881, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=16254 epoch 010: 1423 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462487, ups=1.06, wpb=434465, bsz=16531.6, num_updates=16600, lr=0.000490881, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=16254 epoch 010: 1423 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462487, ups=1.06, wpb=434465, bsz=16531.6, num_updates=16600, lr=0.000490881, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=16254 epoch 010: 1423 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462487, ups=1.06, wpb=434465, bsz=16531.6, num_updates=16600, lr=0.000490881, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=16254 epoch 010: 1523 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=456603, ups=1.06, wpb=432590, bsz=16352.1, num_updates=16700, lr=0.000489409, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=16348 epoch 010: 1523 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=456603, ups=1.06, wpb=432590, bsz=16352.1, num_updates=16700, lr=0.000489409, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=16348 epoch 010: 1523 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=456603, ups=1.06, wpb=432590, bsz=16352.1, num_updates=16700, lr=0.000489409, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=16348 epoch 010: 1523 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=456603, ups=1.06, wpb=432590, bsz=16352.1, num_updates=16700, lr=0.000489409, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=16348 epoch 010: 1523 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=456603, ups=1.06, wpb=432590, bsz=16352.1, num_updates=16700, lr=0.000489409, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=16348 epoch 010: 1523 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=456603, ups=1.06, wpb=432590, bsz=16352.1, num_updates=16700, lr=0.000489409, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=16348 epoch 010: 1523 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=456603, ups=1.06, wpb=432590, bsz=16352.1, num_updates=16700, lr=0.000489409, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=16348 epoch 010: 1523 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=456603, ups=1.06, wpb=432590, bsz=16352.1, num_updates=16700, lr=0.000489409, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=16348 epoch 010: 1523 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=456603, ups=1.06, wpb=432590, bsz=16352.1, num_updates=16700, lr=0.000489409, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=16348 epoch 010: 1523 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=456603, ups=1.06, wpb=432590, bsz=16352.1, num_updates=16700, lr=0.000489409, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=16348 epoch 010: 1623 / 1689 loss=4.308, nll_loss=2.703, ppl=6.51, wps=454925, ups=1.05, wpb=434475, bsz=16252.8, num_updates=16800, lr=0.00048795, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=16444 epoch 010: 1623 / 1689 loss=4.308, nll_loss=2.703, ppl=6.51, wps=454925, ups=1.05, wpb=434475, bsz=16252.8, num_updates=16800, lr=0.00048795, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=16444 epoch 010: 1623 / 1689 loss=4.308, nll_loss=2.703, ppl=6.51, wps=454925, ups=1.05, wpb=434475, bsz=16252.8, num_updates=16800, lr=0.00048795, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=16444 epoch 010: 1623 / 1689 loss=4.308, nll_loss=2.703, ppl=6.51, wps=454925, ups=1.05, wpb=434475, bsz=16252.8, num_updates=16800, lr=0.00048795, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=16444 epoch 010: 1623 / 1689 loss=4.308, nll_loss=2.703, ppl=6.51, wps=454925, ups=1.05, wpb=434475, bsz=16252.8, num_updates=16800, lr=0.00048795, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=16444 epoch 010: 1623 / 1689 loss=4.308, nll_loss=2.703, ppl=6.51, wps=454925, ups=1.05, wpb=434475, bsz=16252.8, num_updates=16800, lr=0.00048795, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=16444 epoch 010: 1623 / 1689 loss=4.308, nll_loss=2.703, ppl=6.51, wps=454925, ups=1.05, wpb=434475, bsz=16252.8, num_updates=16800, lr=0.00048795, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=16444 epoch 010: 1623 / 1689 loss=4.308, nll_loss=2.703, ppl=6.51, wps=454925, ups=1.05, wpb=434475, bsz=16252.8, num_updates=16800, lr=0.00048795, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=16444 epoch 010: 1623 / 1689 loss=4.308, nll_loss=2.703, ppl=6.51, wps=454925, ups=1.05, wpb=434475, bsz=16252.8, num_updates=16800, lr=0.00048795, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=16444 epoch 010: 1623 / 1689 loss=4.308, nll_loss=2.703, ppl=6.51, wps=454925, ups=1.05, wpb=434475, bsz=16252.8, num_updates=16800, lr=0.00048795, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=16444 end of epoch 10 (average epoch stats below) epoch 010 | loss 4.299 | nll_loss 2.692 | ppl 6.46 | wps 453502 | ups 1.05 | wpb 433529 | bsz 16507.4 | num_updates 16866 | lr 0.000486994 | gnorm 0.236 | clip 0.1 | loss_scale 2 | train_wall 1573 | gb_free 21 | wall 16505 epoch 010 | loss 4.299 | nll_loss 2.692 | ppl 6.46 | wps 453502 | ups 1.05 | wpb 433529 | bsz 16507.4 | num_updates 16866 | lr 0.000486994 | gnorm 0.236 | clip 0.1 | loss_scale 2 | train_wall 1573 | gb_free 21 | wall 16505 epoch 010 | loss 4.299 | nll_loss 2.692 | ppl 6.46 | wps 453502 | ups 1.05 | wpb 433529 | bsz 16507.4 | num_updates 16866 | lr 0.000486994 | gnorm 0.236 | clip 0.1 | loss_scale 2 | train_wall 1573 | gb_free 21 | wall 16505 epoch 010 | loss 4.299 | nll_loss 2.692 | ppl 6.46 | wps 453502 | ups 1.05 | wpb 433529 | bsz 16507.4 | num_updates 16866 | lr 0.000486994 | gnorm 0.236 | clip 0.1 | loss_scale 2 | train_wall 1573 | gb_free 21 | wall 16505 epoch 010 | loss 4.299 | nll_loss 2.692 | ppl 6.46 | wps 453502 | ups 1.05 | wpb 433529 | bsz 16507.4 | num_updates 16866 | lr 0.000486994 | gnorm 0.236 | clip 0.1 | loss_scale 2 | train_wall 1573 | gb_free 21 | wall 16505 epoch 010 | loss 4.299 | nll_loss 2.692 | ppl 6.46 | wps 453502 | ups 1.05 | wpb 433529 | bsz 16507.4 | num_updates 16866 | lr 0.000486994 | gnorm 0.236 | clip 0.1 | loss_scale 2 | train_wall 1573 | gb_free 21 | wall 16505 epoch 010 | loss 4.299 | nll_loss 2.692 | ppl 6.46 | wps 453502 | ups 1.05 | wpb 433529 | bsz 16507.4 | num_updates 16866 | lr 0.000486994 | gnorm 0.236 | clip 0.1 | loss_scale 2 | train_wall 1573 | gb_free 21 | wall 16505 epoch 010 | loss 4.299 | nll_loss 2.692 | ppl 6.46 | wps 453502 | ups 1.05 | wpb 433529 | bsz 16507.4 | num_updates 16866 | lr 0.000486994 | gnorm 0.236 | clip 0.1 | loss_scale 2 | train_wall 1573 | gb_free 21 | wall 16505 epoch 010 | loss 4.299 | nll_loss 2.692 | ppl 6.46 | wps 453502 | ups 1.05 | wpb 433529 | bsz 16507.4 | num_updates 16866 | lr 0.000486994 | gnorm 0.236 | clip 0.1 | loss_scale 2 | train_wall 1573 | gb_free 21 | wall 16505 epoch 010 | loss 4.299 | nll_loss 2.692 | ppl 6.46 | wps 453502 | ups 1.05 | wpb 433529 | bsz 16507.4 | num_updates 16866 | lr 0.000486994 | gnorm 0.236 | clip 0.1 | loss_scale 2 | train_wall 1573 | gb_free 21 | wall 16505 Start iterating over samples epoch 011: 34 / 1689 loss=4.293, nll_loss=2.686, ppl=6.44, wps=458750, ups=1.06, wpb=432445, bsz=16096.6, num_updates=16900, lr=0.000486504, gnorm=0.231, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=16538 epoch 011: 34 / 1689 loss=4.293, nll_loss=2.686, ppl=6.44, wps=458750, ups=1.06, wpb=432445, bsz=16096.6, num_updates=16900, lr=0.000486504, gnorm=0.231, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=16538 epoch 011: 34 / 1689 loss=4.293, nll_loss=2.686, ppl=6.44, wps=458750, ups=1.06, wpb=432445, bsz=16096.6, num_updates=16900, lr=0.000486504, gnorm=0.231, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=16538 epoch 011: 34 / 1689 loss=4.293, nll_loss=2.686, ppl=6.44, wps=458750, ups=1.06, wpb=432445, bsz=16096.6, num_updates=16900, lr=0.000486504, gnorm=0.231, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=16538 epoch 011: 34 / 1689 loss=4.293, nll_loss=2.686, ppl=6.44, wps=458750, ups=1.06, wpb=432445, bsz=16096.6, num_updates=16900, lr=0.000486504, gnorm=0.231, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=16538 epoch 011: 34 / 1689 loss=4.293, nll_loss=2.686, ppl=6.44, wps=458750, ups=1.06, wpb=432445, bsz=16096.6, num_updates=16900, lr=0.000486504, gnorm=0.231, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=16538 epoch 011: 34 / 1689 loss=4.293, nll_loss=2.686, ppl=6.44, wps=458750, ups=1.06, wpb=432445, bsz=16096.6, num_updates=16900, lr=0.000486504, gnorm=0.231, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=16538 epoch 011: 34 / 1689 loss=4.293, nll_loss=2.686, ppl=6.44, wps=458750, ups=1.06, wpb=432445, bsz=16096.6, num_updates=16900, lr=0.000486504, gnorm=0.231, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=16538 epoch 011: 34 / 1689 loss=4.293, nll_loss=2.686, ppl=6.44, wps=458750, ups=1.06, wpb=432445, bsz=16096.6, num_updates=16900, lr=0.000486504, gnorm=0.231, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=16538 epoch 011: 34 / 1689 loss=4.293, nll_loss=2.686, ppl=6.44, wps=458750, ups=1.06, wpb=432445, bsz=16096.6, num_updates=16900, lr=0.000486504, gnorm=0.231, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=16538 epoch 011: 34 / 1689 loss=4.293, nll_loss=2.686, ppl=6.44, wps=458750, ups=1.06, wpb=432445, bsz=16096.6, num_updates=16900, lr=0.000486504, gnorm=0.231, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=16538 epoch 011: 135 / 1689 loss=4.26, nll_loss=2.648, ppl=6.27, wps=453265, ups=1.04, wpb=434343, bsz=16647.9, num_updates=17000, lr=0.000485071, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16634 epoch 011: 135 / 1689 loss=4.26, nll_loss=2.648, ppl=6.27, wps=453265, ups=1.04, wpb=434343, bsz=16647.9, num_updates=17000, lr=0.000485071, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16634 epoch 011: 135 / 1689 loss=4.26, nll_loss=2.648, ppl=6.27, wps=453265, ups=1.04, wpb=434343, bsz=16647.9, num_updates=17000, lr=0.000485071, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16634 epoch 011: 135 / 1689 loss=4.26, nll_loss=2.648, ppl=6.27, wps=453265, ups=1.04, wpb=434343, bsz=16647.9, num_updates=17000, lr=0.000485071, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16634 epoch 011: 135 / 1689 loss=4.26, nll_loss=2.648, ppl=6.27, wps=453265, ups=1.04, wpb=434343, bsz=16647.9, num_updates=17000, lr=0.000485071, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16634 epoch 011: 135 / 1689 loss=4.26, nll_loss=2.648, ppl=6.27, wps=453265, ups=1.04, wpb=434343, bsz=16647.9, num_updates=17000, lr=0.000485071, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16634 epoch 011: 135 / 1689 loss=4.26, nll_loss=2.648, ppl=6.27, wps=453265, ups=1.04, wpb=434343, bsz=16647.9, num_updates=17000, lr=0.000485071, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16634 epoch 011: 135 / 1689 loss=4.26, nll_loss=2.648, ppl=6.27, wps=453265, ups=1.04, wpb=434343, bsz=16647.9, num_updates=17000, lr=0.000485071, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16634 epoch 011: 135 / 1689 loss=4.26, nll_loss=2.648, ppl=6.27, wps=453265, ups=1.04, wpb=434343, bsz=16647.9, num_updates=17000, lr=0.000485071, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16634 epoch 011: 135 / 1689 loss=4.26, nll_loss=2.648, ppl=6.27, wps=453265, ups=1.04, wpb=434343, bsz=16647.9, num_updates=17000, lr=0.000485071, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16634 epoch 011: 135 / 1689 loss=4.26, nll_loss=2.648, ppl=6.27, wps=453265, ups=1.04, wpb=434343, bsz=16647.9, num_updates=17000, lr=0.000485071, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=16634 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 4.303 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.303 epoch 011 | valid on 'valid' subset | loss 4.303 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.303 epoch 011 | valid on 'valid' subset | loss 4.303 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.303 epoch 011 | valid on 'valid' subset | loss 4.303 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.303 epoch 011 | valid on 'valid' subset | loss 4.303 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.303 epoch 011 | valid on 'valid' subset | loss 4.303 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.303 epoch 011 | valid on 'valid' subset | loss 4.303 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.303 epoch 011 | valid on 'valid' subset | loss 4.303 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.303 epoch 011 | valid on 'valid' subset | loss 4.303 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.303 epoch 011 | valid on 'valid' subset | loss 4.303 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.303 epoch 011 | valid on 'valid' subset | loss 4.303 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.303 epoch 011: 235 / 1689 loss=4.267, nll_loss=2.656, ppl=6.3, wps=374981, ups=0.87, wpb=431014, bsz=16526.9, num_updates=17100, lr=0.000483651, gnorm=0.228, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=16749 epoch 011: 235 / 1689 loss=4.267, nll_loss=2.656, ppl=6.3, wps=374981, ups=0.87, wpb=431014, bsz=16526.9, num_updates=17100, lr=0.000483651, gnorm=0.228, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=16749 epoch 011: 235 / 1689 loss=4.267, nll_loss=2.656, ppl=6.3, wps=374981, ups=0.87, wpb=431014, bsz=16526.9, num_updates=17100, lr=0.000483651, gnorm=0.228, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=16749 epoch 011: 235 / 1689 loss=4.267, nll_loss=2.656, ppl=6.3, wps=374981, ups=0.87, wpb=431014, bsz=16526.9, num_updates=17100, lr=0.000483651, gnorm=0.228, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=16749 epoch 011: 235 / 1689 loss=4.267, nll_loss=2.656, ppl=6.3, wps=374981, ups=0.87, wpb=431014, bsz=16526.9, num_updates=17100, lr=0.000483651, gnorm=0.228, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=16749 epoch 011: 235 / 1689 loss=4.267, nll_loss=2.656, ppl=6.3, wps=374981, ups=0.87, wpb=431014, bsz=16526.9, num_updates=17100, lr=0.000483651, gnorm=0.228, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=16749 epoch 011: 235 / 1689 loss=4.267, nll_loss=2.656, ppl=6.3, wps=374981, ups=0.87, wpb=431014, bsz=16526.9, num_updates=17100, lr=0.000483651, gnorm=0.228, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=16749 epoch 011: 235 / 1689 loss=4.267, nll_loss=2.656, ppl=6.3, wps=374981, ups=0.87, wpb=431014, bsz=16526.9, num_updates=17100, lr=0.000483651, gnorm=0.228, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=16749 epoch 011: 235 / 1689 loss=4.267, nll_loss=2.656, ppl=6.3, wps=374981, ups=0.87, wpb=431014, bsz=16526.9, num_updates=17100, lr=0.000483651, gnorm=0.228, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=16749 epoch 011: 235 / 1689 loss=4.267, nll_loss=2.656, ppl=6.3, wps=374981, ups=0.87, wpb=431014, bsz=16526.9, num_updates=17100, lr=0.000483651, gnorm=0.228, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=16749 epoch 011: 235 / 1689 loss=4.267, nll_loss=2.656, ppl=6.3, wps=374981, ups=0.87, wpb=431014, bsz=16526.9, num_updates=17100, lr=0.000483651, gnorm=0.228, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=16749 epoch 011: 335 / 1689 loss=4.273, nll_loss=2.663, ppl=6.33, wps=461641, ups=1.06, wpb=433476, bsz=16920.4, num_updates=17200, lr=0.000482243, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=16843 epoch 011: 335 / 1689 loss=4.273, nll_loss=2.663, ppl=6.33, wps=461641, ups=1.06, wpb=433476, bsz=16920.4, num_updates=17200, lr=0.000482243, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=16843 epoch 011: 335 / 1689 loss=4.273, nll_loss=2.663, ppl=6.33, wps=461641, ups=1.06, wpb=433476, bsz=16920.4, num_updates=17200, lr=0.000482243, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=16843 epoch 011: 335 / 1689 loss=4.273, nll_loss=2.663, ppl=6.33, wps=461641, ups=1.06, wpb=433476, bsz=16920.4, num_updates=17200, lr=0.000482243, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=16843 epoch 011: 335 / 1689 loss=4.273, nll_loss=2.663, ppl=6.33, wps=461641, ups=1.06, wpb=433476, bsz=16920.4, num_updates=17200, lr=0.000482243, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=16843 epoch 011: 335 / 1689 loss=4.273, nll_loss=2.663, ppl=6.33, wps=461641, ups=1.06, wpb=433476, bsz=16920.4, num_updates=17200, lr=0.000482243, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=16843 epoch 011: 335 / 1689 loss=4.273, nll_loss=2.663, ppl=6.33, wps=461641, ups=1.06, wpb=433476, bsz=16920.4, num_updates=17200, lr=0.000482243, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=16843 epoch 011: 335 / 1689 loss=4.273, nll_loss=2.663, ppl=6.33, wps=461641, ups=1.06, wpb=433476, bsz=16920.4, num_updates=17200, lr=0.000482243, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=16843 epoch 011: 335 / 1689 loss=4.273, nll_loss=2.663, ppl=6.33, wps=461641, ups=1.06, wpb=433476, bsz=16920.4, num_updates=17200, lr=0.000482243, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=16843 epoch 011: 335 / 1689 loss=4.273, nll_loss=2.663, ppl=6.33, wps=461641, ups=1.06, wpb=433476, bsz=16920.4, num_updates=17200, lr=0.000482243, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=16843 epoch 011: 335 / 1689 loss=4.273, nll_loss=2.663, ppl=6.33, wps=461641, ups=1.06, wpb=433476, bsz=16920.4, num_updates=17200, lr=0.000482243, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=16843 epoch 011: 435 / 1689 loss=4.278, nll_loss=2.668, ppl=6.36, wps=456973, ups=1.05, wpb=434656, bsz=16296.7, num_updates=17300, lr=0.000480847, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=16938 epoch 011: 435 / 1689 loss=4.278, nll_loss=2.668, ppl=6.36, wps=456973, ups=1.05, wpb=434656, bsz=16296.7, num_updates=17300, lr=0.000480847, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=16938 epoch 011: 435 / 1689 loss=4.278, nll_loss=2.668, ppl=6.36, wps=456973, ups=1.05, wpb=434656, bsz=16296.7, num_updates=17300, lr=0.000480847, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=16938 epoch 011: 435 / 1689 loss=4.278, nll_loss=2.668, ppl=6.36, wps=456973, ups=1.05, wpb=434656, bsz=16296.7, num_updates=17300, lr=0.000480847, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=16938 epoch 011: 435 / 1689 loss=4.278, nll_loss=2.668, ppl=6.36, wps=456973, ups=1.05, wpb=434656, bsz=16296.7, num_updates=17300, lr=0.000480847, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=16938 epoch 011: 435 / 1689 loss=4.278, nll_loss=2.668, ppl=6.36, wps=456973, ups=1.05, wpb=434656, bsz=16296.7, num_updates=17300, lr=0.000480847, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=16938 epoch 011: 435 / 1689 loss=4.278, nll_loss=2.668, ppl=6.36, wps=456973, ups=1.05, wpb=434656, bsz=16296.7, num_updates=17300, lr=0.000480847, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=16938 epoch 011: 435 / 1689 loss=4.278, nll_loss=2.668, ppl=6.36, wps=456973, ups=1.05, wpb=434656, bsz=16296.7, num_updates=17300, lr=0.000480847, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=16938 epoch 011: 435 / 1689 loss=4.278, nll_loss=2.668, ppl=6.36, wps=456973, ups=1.05, wpb=434656, bsz=16296.7, num_updates=17300, lr=0.000480847, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=16938 epoch 011: 435 / 1689 loss=4.278, nll_loss=2.668, ppl=6.36, wps=456973, ups=1.05, wpb=434656, bsz=16296.7, num_updates=17300, lr=0.000480847, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=16938 epoch 011: 435 / 1689 loss=4.278, nll_loss=2.668, ppl=6.36, wps=456973, ups=1.05, wpb=434656, bsz=16296.7, num_updates=17300, lr=0.000480847, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=16938 epoch 011: 535 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=460876, ups=1.06, wpb=435967, bsz=16600.1, num_updates=17400, lr=0.000479463, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17032 epoch 011: 535 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=460876, ups=1.06, wpb=435967, bsz=16600.1, num_updates=17400, lr=0.000479463, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17032 epoch 011: 535 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=460876, ups=1.06, wpb=435967, bsz=16600.1, num_updates=17400, lr=0.000479463, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17032 epoch 011: 535 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=460876, ups=1.06, wpb=435967, bsz=16600.1, num_updates=17400, lr=0.000479463, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17032 epoch 011: 535 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=460876, ups=1.06, wpb=435967, bsz=16600.1, num_updates=17400, lr=0.000479463, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17032 epoch 011: 535 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=460876, ups=1.06, wpb=435967, bsz=16600.1, num_updates=17400, lr=0.000479463, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17032 epoch 011: 535 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=460876, ups=1.06, wpb=435967, bsz=16600.1, num_updates=17400, lr=0.000479463, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17032 epoch 011: 535 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=460876, ups=1.06, wpb=435967, bsz=16600.1, num_updates=17400, lr=0.000479463, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17032 epoch 011: 535 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=460876, ups=1.06, wpb=435967, bsz=16600.1, num_updates=17400, lr=0.000479463, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17032 epoch 011: 535 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=460876, ups=1.06, wpb=435967, bsz=16600.1, num_updates=17400, lr=0.000479463, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17032 epoch 011: 535 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=460876, ups=1.06, wpb=435967, bsz=16600.1, num_updates=17400, lr=0.000479463, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17032 epoch 011: 635 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=457080, ups=1.05, wpb=435531, bsz=16285.8, num_updates=17500, lr=0.000478091, gnorm=0.225, clip=0, loss_scale=4, train_wall=94, gb_free=19.5, wall=17128 epoch 011: 635 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=457080, ups=1.05, wpb=435531, bsz=16285.8, num_updates=17500, lr=0.000478091, gnorm=0.225, clip=0, loss_scale=4, train_wall=94, gb_free=19.5, wall=17128 epoch 011: 635 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=457080, ups=1.05, wpb=435531, bsz=16285.8, num_updates=17500, lr=0.000478091, gnorm=0.225, clip=0, loss_scale=4, train_wall=94, gb_free=19.5, wall=17128 epoch 011: 635 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=457080, ups=1.05, wpb=435531, bsz=16285.8, num_updates=17500, lr=0.000478091, gnorm=0.225, clip=0, loss_scale=4, train_wall=94, gb_free=19.5, wall=17128 epoch 011: 635 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=457080, ups=1.05, wpb=435531, bsz=16285.8, num_updates=17500, lr=0.000478091, gnorm=0.225, clip=0, loss_scale=4, train_wall=94, gb_free=19.5, wall=17128 epoch 011: 635 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=457080, ups=1.05, wpb=435531, bsz=16285.8, num_updates=17500, lr=0.000478091, gnorm=0.225, clip=0, loss_scale=4, train_wall=94, gb_free=19.5, wall=17128 epoch 011: 635 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=457080, ups=1.05, wpb=435531, bsz=16285.8, num_updates=17500, lr=0.000478091, gnorm=0.225, clip=0, loss_scale=4, train_wall=94, gb_free=19.5, wall=17128 epoch 011: 635 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=457080, ups=1.05, wpb=435531, bsz=16285.8, num_updates=17500, lr=0.000478091, gnorm=0.225, clip=0, loss_scale=4, train_wall=94, gb_free=19.5, wall=17128 epoch 011: 635 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=457080, ups=1.05, wpb=435531, bsz=16285.8, num_updates=17500, lr=0.000478091, gnorm=0.225, clip=0, loss_scale=4, train_wall=94, gb_free=19.5, wall=17128 epoch 011: 635 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=457080, ups=1.05, wpb=435531, bsz=16285.8, num_updates=17500, lr=0.000478091, gnorm=0.225, clip=0, loss_scale=4, train_wall=94, gb_free=19.5, wall=17128 epoch 011: 635 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=457080, ups=1.05, wpb=435531, bsz=16285.8, num_updates=17500, lr=0.000478091, gnorm=0.225, clip=0, loss_scale=4, train_wall=94, gb_free=19.5, wall=17128 epoch 011: 735 / 1689 loss=4.288, nll_loss=2.68, ppl=6.41, wps=454429, ups=1.05, wpb=433472, bsz=16457.8, num_updates=17600, lr=0.000476731, gnorm=0.222, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=17223 epoch 011: 735 / 1689 loss=4.288, nll_loss=2.68, ppl=6.41, wps=454429, ups=1.05, wpb=433472, bsz=16457.8, num_updates=17600, lr=0.000476731, gnorm=0.222, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=17223 epoch 011: 735 / 1689 loss=4.288, nll_loss=2.68, ppl=6.41, wps=454429, ups=1.05, wpb=433472, bsz=16457.8, num_updates=17600, lr=0.000476731, gnorm=0.222, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=17223 epoch 011: 735 / 1689 loss=4.288, nll_loss=2.68, ppl=6.41, wps=454429, ups=1.05, wpb=433472, bsz=16457.8, num_updates=17600, lr=0.000476731, gnorm=0.222, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=17223 epoch 011: 735 / 1689 loss=4.288, nll_loss=2.68, ppl=6.41, wps=454429, ups=1.05, wpb=433472, bsz=16457.8, num_updates=17600, lr=0.000476731, gnorm=0.222, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=17223 epoch 011: 735 / 1689 loss=4.288, nll_loss=2.68, ppl=6.41, wps=454429, ups=1.05, wpb=433472, bsz=16457.8, num_updates=17600, lr=0.000476731, gnorm=0.222, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=17223 epoch 011: 735 / 1689 loss=4.288, nll_loss=2.68, ppl=6.41, wps=454429, ups=1.05, wpb=433472, bsz=16457.8, num_updates=17600, lr=0.000476731, gnorm=0.222, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=17223 epoch 011: 735 / 1689 loss=4.288, nll_loss=2.68, ppl=6.41, wps=454429, ups=1.05, wpb=433472, bsz=16457.8, num_updates=17600, lr=0.000476731, gnorm=0.222, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=17223 epoch 011: 735 / 1689 loss=4.288, nll_loss=2.68, ppl=6.41, wps=454429, ups=1.05, wpb=433472, bsz=16457.8, num_updates=17600, lr=0.000476731, gnorm=0.222, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=17223 epoch 011: 735 / 1689 loss=4.288, nll_loss=2.68, ppl=6.41, wps=454429, ups=1.05, wpb=433472, bsz=16457.8, num_updates=17600, lr=0.000476731, gnorm=0.222, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=17223 epoch 011: 735 / 1689 loss=4.288, nll_loss=2.68, ppl=6.41, wps=454429, ups=1.05, wpb=433472, bsz=16457.8, num_updates=17600, lr=0.000476731, gnorm=0.222, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=17223 epoch 011: 835 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=452431, ups=1.04, wpb=435095, bsz=16602.3, num_updates=17700, lr=0.000475383, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=17319 epoch 011: 835 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=452431, ups=1.04, wpb=435095, bsz=16602.3, num_updates=17700, lr=0.000475383, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=17319 epoch 011: 835 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=452431, ups=1.04, wpb=435095, bsz=16602.3, num_updates=17700, lr=0.000475383, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=17319 epoch 011: 835 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=452431, ups=1.04, wpb=435095, bsz=16602.3, num_updates=17700, lr=0.000475383, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=17319 epoch 011: 835 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=452431, ups=1.04, wpb=435095, bsz=16602.3, num_updates=17700, lr=0.000475383, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=17319 epoch 011: 835 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=452431, ups=1.04, wpb=435095, bsz=16602.3, num_updates=17700, lr=0.000475383, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=17319 epoch 011: 835 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=452431, ups=1.04, wpb=435095, bsz=16602.3, num_updates=17700, lr=0.000475383, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=17319 epoch 011: 835 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=452431, ups=1.04, wpb=435095, bsz=16602.3, num_updates=17700, lr=0.000475383, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=17319 epoch 011: 835 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=452431, ups=1.04, wpb=435095, bsz=16602.3, num_updates=17700, lr=0.000475383, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=17319 epoch 011: 835 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=452431, ups=1.04, wpb=435095, bsz=16602.3, num_updates=17700, lr=0.000475383, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=17319 epoch 011: 835 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=452431, ups=1.04, wpb=435095, bsz=16602.3, num_updates=17700, lr=0.000475383, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=17319 epoch 011: 936 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=450543, ups=1.05, wpb=430764, bsz=16511.8, num_updates=17800, lr=0.000474045, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=17415 epoch 011: 936 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=450543, ups=1.05, wpb=430764, bsz=16511.8, num_updates=17800, lr=0.000474045, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=17415 epoch 011: 936 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=450543, ups=1.05, wpb=430764, bsz=16511.8, num_updates=17800, lr=0.000474045, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=17415 epoch 011: 936 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=450543, ups=1.05, wpb=430764, bsz=16511.8, num_updates=17800, lr=0.000474045, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=17415 epoch 011: 936 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=450543, ups=1.05, wpb=430764, bsz=16511.8, num_updates=17800, lr=0.000474045, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=17415 epoch 011: 936 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=450543, ups=1.05, wpb=430764, bsz=16511.8, num_updates=17800, lr=0.000474045, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=17415 epoch 011: 936 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=450543, ups=1.05, wpb=430764, bsz=16511.8, num_updates=17800, lr=0.000474045, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=17415 epoch 011: 936 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=450543, ups=1.05, wpb=430764, bsz=16511.8, num_updates=17800, lr=0.000474045, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=17415 epoch 011: 936 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=450543, ups=1.05, wpb=430764, bsz=16511.8, num_updates=17800, lr=0.000474045, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=17415 epoch 011: 936 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=450543, ups=1.05, wpb=430764, bsz=16511.8, num_updates=17800, lr=0.000474045, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=17415 epoch 011: 936 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=450543, ups=1.05, wpb=430764, bsz=16511.8, num_updates=17800, lr=0.000474045, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=20.4, wall=17415 epoch 011: 1036 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459310, ups=1.06, wpb=434568, bsz=16754.8, num_updates=17900, lr=0.000472719, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=17510 epoch 011: 1036 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459310, ups=1.06, wpb=434568, bsz=16754.8, num_updates=17900, lr=0.000472719, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=17510 epoch 011: 1036 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459310, ups=1.06, wpb=434568, bsz=16754.8, num_updates=17900, lr=0.000472719, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=17510 epoch 011: 1036 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459310, ups=1.06, wpb=434568, bsz=16754.8, num_updates=17900, lr=0.000472719, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=17510 epoch 011: 1036 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459310, ups=1.06, wpb=434568, bsz=16754.8, num_updates=17900, lr=0.000472719, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=17510 epoch 011: 1036 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459310, ups=1.06, wpb=434568, bsz=16754.8, num_updates=17900, lr=0.000472719, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=17510 epoch 011: 1036 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459310, ups=1.06, wpb=434568, bsz=16754.8, num_updates=17900, lr=0.000472719, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=17510 epoch 011: 1036 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459310, ups=1.06, wpb=434568, bsz=16754.8, num_updates=17900, lr=0.000472719, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=17510 epoch 011: 1036 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459310, ups=1.06, wpb=434568, bsz=16754.8, num_updates=17900, lr=0.000472719, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=17510 epoch 011: 1036 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459310, ups=1.06, wpb=434568, bsz=16754.8, num_updates=17900, lr=0.000472719, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=17510 epoch 011: 1036 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459310, ups=1.06, wpb=434568, bsz=16754.8, num_updates=17900, lr=0.000472719, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=17510 epoch 011: 1136 / 1689 loss=4.275, nll_loss=2.666, ppl=6.35, wps=453245, ups=1.05, wpb=433144, bsz=16544, num_updates=18000, lr=0.000471405, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=17605 epoch 011: 1136 / 1689 loss=4.275, nll_loss=2.666, ppl=6.35, wps=453245, ups=1.05, wpb=433144, bsz=16544, num_updates=18000, lr=0.000471405, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=17605 epoch 011: 1136 / 1689 loss=4.275, nll_loss=2.666, ppl=6.35, wps=453245, ups=1.05, wpb=433144, bsz=16544, num_updates=18000, lr=0.000471405, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=17605 epoch 011: 1136 / 1689 loss=4.275, nll_loss=2.666, ppl=6.35, wps=453245, ups=1.05, wpb=433144, bsz=16544, num_updates=18000, lr=0.000471405, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=17605 epoch 011: 1136 / 1689 loss=4.275, nll_loss=2.666, ppl=6.35, wps=453245, ups=1.05, wpb=433144, bsz=16544, num_updates=18000, lr=0.000471405, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=17605 epoch 011: 1136 / 1689 loss=4.275, nll_loss=2.666, ppl=6.35, wps=453245, ups=1.05, wpb=433144, bsz=16544, num_updates=18000, lr=0.000471405, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=17605 epoch 011: 1136 / 1689 loss=4.275, nll_loss=2.666, ppl=6.35, wps=453245, ups=1.05, wpb=433144, bsz=16544, num_updates=18000, lr=0.000471405, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=17605 epoch 011: 1136 / 1689 loss=4.275, nll_loss=2.666, ppl=6.35, wps=453245, ups=1.05, wpb=433144, bsz=16544, num_updates=18000, lr=0.000471405, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=17605 epoch 011: 1136 / 1689 loss=4.275, nll_loss=2.666, ppl=6.35, wps=453245, ups=1.05, wpb=433144, bsz=16544, num_updates=18000, lr=0.000471405, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=17605 epoch 011: 1136 / 1689 loss=4.275, nll_loss=2.666, ppl=6.35, wps=453245, ups=1.05, wpb=433144, bsz=16544, num_updates=18000, lr=0.000471405, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=17605 epoch 011: 1136 / 1689 loss=4.275, nll_loss=2.666, ppl=6.35, wps=453245, ups=1.05, wpb=433144, bsz=16544, num_updates=18000, lr=0.000471405, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=17605 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 4.302 | nll_loss 2.662 | ppl 6.33 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.302 epoch 011 | valid on 'valid' subset | loss 4.302 | nll_loss 2.662 | ppl 6.33 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.302 epoch 011 | valid on 'valid' subset | loss 4.302 | nll_loss 2.662 | ppl 6.33 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.302 epoch 011 | valid on 'valid' subset | loss 4.302 | nll_loss 2.662 | ppl 6.33 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.302 epoch 011 | valid on 'valid' subset | loss 4.302 | nll_loss 2.662 | ppl 6.33 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.302 epoch 011 | valid on 'valid' subset | loss 4.302 | nll_loss 2.662 | ppl 6.33 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.302 epoch 011 | valid on 'valid' subset | loss 4.302 | nll_loss 2.662 | ppl 6.33 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.302 epoch 011 | valid on 'valid' subset | loss 4.302 | nll_loss 2.662 | ppl 6.33 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.302 epoch 011 | valid on 'valid' subset | loss 4.302 | nll_loss 2.662 | ppl 6.33 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.302 epoch 011 | valid on 'valid' subset | loss 4.302 | nll_loss 2.662 | ppl 6.33 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.302 epoch 011 | valid on 'valid' subset | loss 4.302 | nll_loss 2.662 | ppl 6.33 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.302 epoch 011: 1236 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=377181, ups=0.87, wpb=432760, bsz=16805.3, num_updates=18100, lr=0.0004701, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=17720 epoch 011: 1236 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=377181, ups=0.87, wpb=432760, bsz=16805.3, num_updates=18100, lr=0.0004701, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=17720 epoch 011: 1236 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=377181, ups=0.87, wpb=432760, bsz=16805.3, num_updates=18100, lr=0.0004701, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=17720 epoch 011: 1236 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=377181, ups=0.87, wpb=432760, bsz=16805.3, num_updates=18100, lr=0.0004701, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=17720 epoch 011: 1236 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=377181, ups=0.87, wpb=432760, bsz=16805.3, num_updates=18100, lr=0.0004701, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=17720 epoch 011: 1236 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=377181, ups=0.87, wpb=432760, bsz=16805.3, num_updates=18100, lr=0.0004701, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=17720 epoch 011: 1236 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=377181, ups=0.87, wpb=432760, bsz=16805.3, num_updates=18100, lr=0.0004701, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=17720 epoch 011: 1236 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=377181, ups=0.87, wpb=432760, bsz=16805.3, num_updates=18100, lr=0.0004701, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=17720 epoch 011: 1236 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=377181, ups=0.87, wpb=432760, bsz=16805.3, num_updates=18100, lr=0.0004701, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=17720 epoch 011: 1236 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=377181, ups=0.87, wpb=432760, bsz=16805.3, num_updates=18100, lr=0.0004701, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=17720 epoch 011: 1236 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=377181, ups=0.87, wpb=432760, bsz=16805.3, num_updates=18100, lr=0.0004701, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=17720 epoch 011: 1336 / 1689 loss=4.275, nll_loss=2.667, ppl=6.35, wps=465322, ups=1.07, wpb=433440, bsz=16388.2, num_updates=18200, lr=0.000468807, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=17813 epoch 011: 1336 / 1689 loss=4.275, nll_loss=2.667, ppl=6.35, wps=465322, ups=1.07, wpb=433440, bsz=16388.2, num_updates=18200, lr=0.000468807, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=17813 epoch 011: 1336 / 1689 loss=4.275, nll_loss=2.667, ppl=6.35, wps=465322, ups=1.07, wpb=433440, bsz=16388.2, num_updates=18200, lr=0.000468807, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=17813 epoch 011: 1336 / 1689 loss=4.275, nll_loss=2.667, ppl=6.35, wps=465322, ups=1.07, wpb=433440, bsz=16388.2, num_updates=18200, lr=0.000468807, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=17813 epoch 011: 1336 / 1689 loss=4.275, nll_loss=2.667, ppl=6.35, wps=465322, ups=1.07, wpb=433440, bsz=16388.2, num_updates=18200, lr=0.000468807, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=17813 epoch 011: 1336 / 1689 loss=4.275, nll_loss=2.667, ppl=6.35, wps=465322, ups=1.07, wpb=433440, bsz=16388.2, num_updates=18200, lr=0.000468807, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=17813 epoch 011: 1336 / 1689 loss=4.275, nll_loss=2.667, ppl=6.35, wps=465322, ups=1.07, wpb=433440, bsz=16388.2, num_updates=18200, lr=0.000468807, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=17813 epoch 011: 1336 / 1689 loss=4.275, nll_loss=2.667, ppl=6.35, wps=465322, ups=1.07, wpb=433440, bsz=16388.2, num_updates=18200, lr=0.000468807, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=17813 epoch 011: 1336 / 1689 loss=4.275, nll_loss=2.667, ppl=6.35, wps=465322, ups=1.07, wpb=433440, bsz=16388.2, num_updates=18200, lr=0.000468807, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=17813 epoch 011: 1336 / 1689 loss=4.275, nll_loss=2.667, ppl=6.35, wps=465322, ups=1.07, wpb=433440, bsz=16388.2, num_updates=18200, lr=0.000468807, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=17813 epoch 011: 1336 / 1689 loss=4.275, nll_loss=2.667, ppl=6.35, wps=465322, ups=1.07, wpb=433440, bsz=16388.2, num_updates=18200, lr=0.000468807, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=17813 epoch 011: 1437 / 1689 loss=4.285, nll_loss=2.677, ppl=6.4, wps=456808, ups=1.05, wpb=433821, bsz=16168.1, num_updates=18300, lr=0.000467525, gnorm=0.233, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17908 epoch 011: 1437 / 1689 loss=4.285, nll_loss=2.677, ppl=6.4, wps=456808, ups=1.05, wpb=433821, bsz=16168.1, num_updates=18300, lr=0.000467525, gnorm=0.233, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17908 epoch 011: 1437 / 1689 loss=4.285, nll_loss=2.677, ppl=6.4, wps=456808, ups=1.05, wpb=433821, bsz=16168.1, num_updates=18300, lr=0.000467525, gnorm=0.233, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17908 epoch 011: 1437 / 1689 loss=4.285, nll_loss=2.677, ppl=6.4, wps=456808, ups=1.05, wpb=433821, bsz=16168.1, num_updates=18300, lr=0.000467525, gnorm=0.233, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17908 epoch 011: 1437 / 1689 loss=4.285, nll_loss=2.677, ppl=6.4, wps=456808, ups=1.05, wpb=433821, bsz=16168.1, num_updates=18300, lr=0.000467525, gnorm=0.233, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17908 epoch 011: 1437 / 1689 loss=4.285, nll_loss=2.677, ppl=6.4, wps=456808, ups=1.05, wpb=433821, bsz=16168.1, num_updates=18300, lr=0.000467525, gnorm=0.233, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17908 epoch 011: 1437 / 1689 loss=4.285, nll_loss=2.677, ppl=6.4, wps=456808, ups=1.05, wpb=433821, bsz=16168.1, num_updates=18300, lr=0.000467525, gnorm=0.233, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17908 epoch 011: 1437 / 1689 loss=4.285, nll_loss=2.677, ppl=6.4, wps=456808, ups=1.05, wpb=433821, bsz=16168.1, num_updates=18300, lr=0.000467525, gnorm=0.233, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17908 epoch 011: 1437 / 1689 loss=4.285, nll_loss=2.677, ppl=6.4, wps=456808, ups=1.05, wpb=433821, bsz=16168.1, num_updates=18300, lr=0.000467525, gnorm=0.233, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17908 epoch 011: 1437 / 1689 loss=4.285, nll_loss=2.677, ppl=6.4, wps=456808, ups=1.05, wpb=433821, bsz=16168.1, num_updates=18300, lr=0.000467525, gnorm=0.233, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17908 epoch 011: 1437 / 1689 loss=4.285, nll_loss=2.677, ppl=6.4, wps=456808, ups=1.05, wpb=433821, bsz=16168.1, num_updates=18300, lr=0.000467525, gnorm=0.233, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17908 epoch 011: 1537 / 1689 loss=4.283, nll_loss=2.675, ppl=6.38, wps=463268, ups=1.07, wpb=434933, bsz=16108.7, num_updates=18400, lr=0.000466252, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18002 epoch 011: 1537 / 1689 loss=4.283, nll_loss=2.675, ppl=6.38, wps=463268, ups=1.07, wpb=434933, bsz=16108.7, num_updates=18400, lr=0.000466252, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18002 epoch 011: 1537 / 1689 loss=4.283, nll_loss=2.675, ppl=6.38, wps=463268, ups=1.07, wpb=434933, bsz=16108.7, num_updates=18400, lr=0.000466252, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18002 epoch 011: 1537 / 1689 loss=4.283, nll_loss=2.675, ppl=6.38, wps=463268, ups=1.07, wpb=434933, bsz=16108.7, num_updates=18400, lr=0.000466252, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18002 epoch 011: 1537 / 1689 loss=4.283, nll_loss=2.675, ppl=6.38, wps=463268, ups=1.07, wpb=434933, bsz=16108.7, num_updates=18400, lr=0.000466252, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18002 epoch 011: 1537 / 1689 loss=4.283, nll_loss=2.675, ppl=6.38, wps=463268, ups=1.07, wpb=434933, bsz=16108.7, num_updates=18400, lr=0.000466252, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18002 epoch 011: 1537 / 1689 loss=4.283, nll_loss=2.675, ppl=6.38, wps=463268, ups=1.07, wpb=434933, bsz=16108.7, num_updates=18400, lr=0.000466252, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18002 epoch 011: 1537 / 1689 loss=4.283, nll_loss=2.675, ppl=6.38, wps=463268, ups=1.07, wpb=434933, bsz=16108.7, num_updates=18400, lr=0.000466252, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18002 epoch 011: 1537 / 1689 loss=4.283, nll_loss=2.675, ppl=6.38, wps=463268, ups=1.07, wpb=434933, bsz=16108.7, num_updates=18400, lr=0.000466252, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18002 epoch 011: 1537 / 1689 loss=4.283, nll_loss=2.675, ppl=6.38, wps=463268, ups=1.07, wpb=434933, bsz=16108.7, num_updates=18400, lr=0.000466252, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18002 epoch 011: 1537 / 1689 loss=4.283, nll_loss=2.675, ppl=6.38, wps=463268, ups=1.07, wpb=434933, bsz=16108.7, num_updates=18400, lr=0.000466252, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18002 epoch 011: 1637 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=457473, ups=1.06, wpb=431314, bsz=16421, num_updates=18500, lr=0.000464991, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18096 epoch 011: 1637 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=457473, ups=1.06, wpb=431314, bsz=16421, num_updates=18500, lr=0.000464991, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18096 epoch 011: 1637 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=457473, ups=1.06, wpb=431314, bsz=16421, num_updates=18500, lr=0.000464991, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18096 epoch 011: 1637 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=457473, ups=1.06, wpb=431314, bsz=16421, num_updates=18500, lr=0.000464991, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18096 epoch 011: 1637 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=457473, ups=1.06, wpb=431314, bsz=16421, num_updates=18500, lr=0.000464991, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18096 epoch 011: 1637 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=457473, ups=1.06, wpb=431314, bsz=16421, num_updates=18500, lr=0.000464991, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18096 epoch 011: 1637 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=457473, ups=1.06, wpb=431314, bsz=16421, num_updates=18500, lr=0.000464991, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18096 epoch 011: 1637 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=457473, ups=1.06, wpb=431314, bsz=16421, num_updates=18500, lr=0.000464991, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18096 epoch 011: 1637 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=457473, ups=1.06, wpb=431314, bsz=16421, num_updates=18500, lr=0.000464991, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18096 epoch 011: 1637 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=457473, ups=1.06, wpb=431314, bsz=16421, num_updates=18500, lr=0.000464991, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18096 epoch 011: 1637 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=457473, ups=1.06, wpb=431314, bsz=16421, num_updates=18500, lr=0.000464991, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=18096 end of epoch 11 (average epoch stats below) epoch 011 | loss 4.278 | nll_loss 2.669 | ppl 6.36 | wps 445723 | ups 1.03 | wpb 433550 | bsz 16505 | num_updates 18552 | lr 0.000464338 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 20.7 | wall 18145 epoch 011 | loss 4.278 | nll_loss 2.669 | ppl 6.36 | wps 445723 | ups 1.03 | wpb 433550 | bsz 16505 | num_updates 18552 | lr 0.000464338 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 20.7 | wall 18145 epoch 011 | loss 4.278 | nll_loss 2.669 | ppl 6.36 | wps 445723 | ups 1.03 | wpb 433550 | bsz 16505 | num_updates 18552 | lr 0.000464338 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 20.7 | wall 18145 epoch 011 | loss 4.278 | nll_loss 2.669 | ppl 6.36 | wps 445723 | ups 1.03 | wpb 433550 | bsz 16505 | num_updates 18552 | lr 0.000464338 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 20.7 | wall 18145 epoch 011 | loss 4.278 | nll_loss 2.669 | ppl 6.36 | wps 445723 | ups 1.03 | wpb 433550 | bsz 16505 | num_updates 18552 | lr 0.000464338 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 20.7 | wall 18145 epoch 011 | loss 4.278 | nll_loss 2.669 | ppl 6.36 | wps 445723 | ups 1.03 | wpb 433550 | bsz 16505 | num_updates 18552 | lr 0.000464338 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 20.7 | wall 18145 epoch 011 | loss 4.278 | nll_loss 2.669 | ppl 6.36 | wps 445723 | ups 1.03 | wpb 433550 | bsz 16505 | num_updates 18552 | lr 0.000464338 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 20.7 | wall 18145 epoch 011 | loss 4.278 | nll_loss 2.669 | ppl 6.36 | wps 445723 | ups 1.03 | wpb 433550 | bsz 16505 | num_updates 18552 | lr 0.000464338 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 20.7 | wall 18145 epoch 011 | loss 4.278 | nll_loss 2.669 | ppl 6.36 | wps 445723 | ups 1.03 | wpb 433550 | bsz 16505 | num_updates 18552 | lr 0.000464338 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 20.7 | wall 18145 epoch 011 | loss 4.278 | nll_loss 2.669 | ppl 6.36 | wps 445723 | ups 1.03 | wpb 433550 | bsz 16505 | num_updates 18552 | lr 0.000464338 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 20.7 | wall 18145 epoch 011 | loss 4.278 | nll_loss 2.669 | ppl 6.36 | wps 445723 | ups 1.03 | wpb 433550 | bsz 16505 | num_updates 18552 | lr 0.000464338 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 20.7 | wall 18145 Start iterating over samples epoch 012: 48 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=447619, ups=1.04, wpb=429596, bsz=16385.5, num_updates=18600, lr=0.000463739, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=18192 epoch 012: 48 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=447619, ups=1.04, wpb=429596, bsz=16385.5, num_updates=18600, lr=0.000463739, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=18192 epoch 012: 48 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=447619, ups=1.04, wpb=429596, bsz=16385.5, num_updates=18600, lr=0.000463739, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=18192 epoch 012: 48 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=447619, ups=1.04, wpb=429596, bsz=16385.5, num_updates=18600, lr=0.000463739, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=18192 epoch 012: 48 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=447619, ups=1.04, wpb=429596, bsz=16385.5, num_updates=18600, lr=0.000463739, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=18192 epoch 012: 48 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=447619, ups=1.04, wpb=429596, bsz=16385.5, num_updates=18600, lr=0.000463739, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=18192 epoch 012: 48 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=447619, ups=1.04, wpb=429596, bsz=16385.5, num_updates=18600, lr=0.000463739, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=18192 epoch 012: 48 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=447619, ups=1.04, wpb=429596, bsz=16385.5, num_updates=18600, lr=0.000463739, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=18192 epoch 012: 48 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=447619, ups=1.04, wpb=429596, bsz=16385.5, num_updates=18600, lr=0.000463739, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=18192 epoch 012: 48 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=447619, ups=1.04, wpb=429596, bsz=16385.5, num_updates=18600, lr=0.000463739, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=18192 epoch 012: 48 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=447619, ups=1.04, wpb=429596, bsz=16385.5, num_updates=18600, lr=0.000463739, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=18192 epoch 012: 48 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=447619, ups=1.04, wpb=429596, bsz=16385.5, num_updates=18600, lr=0.000463739, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=18192 epoch 012: 148 / 1689 loss=4.245, nll_loss=2.631, ppl=6.19, wps=460674, ups=1.06, wpb=432853, bsz=16147.7, num_updates=18700, lr=0.000462497, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=18286 epoch 012: 148 / 1689 loss=4.245, nll_loss=2.631, ppl=6.19, wps=460674, ups=1.06, wpb=432853, bsz=16147.7, num_updates=18700, lr=0.000462497, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=18286 epoch 012: 148 / 1689 loss=4.245, nll_loss=2.631, ppl=6.19, wps=460674, ups=1.06, wpb=432853, bsz=16147.7, num_updates=18700, lr=0.000462497, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=18286 epoch 012: 148 / 1689 loss=4.245, nll_loss=2.631, ppl=6.19, wps=460674, ups=1.06, wpb=432853, bsz=16147.7, num_updates=18700, lr=0.000462497, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=18286 epoch 012: 148 / 1689 loss=4.245, nll_loss=2.631, ppl=6.19, wps=460674, ups=1.06, wpb=432853, bsz=16147.7, num_updates=18700, lr=0.000462497, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=18286 epoch 012: 148 / 1689 loss=4.245, nll_loss=2.631, ppl=6.19, wps=460674, ups=1.06, wpb=432853, bsz=16147.7, num_updates=18700, lr=0.000462497, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=18286 epoch 012: 148 / 1689 loss=4.245, nll_loss=2.631, ppl=6.19, wps=460674, ups=1.06, wpb=432853, bsz=16147.7, num_updates=18700, lr=0.000462497, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=18286 epoch 012: 148 / 1689 loss=4.245, nll_loss=2.631, ppl=6.19, wps=460674, ups=1.06, wpb=432853, bsz=16147.7, num_updates=18700, lr=0.000462497, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=18286 epoch 012: 148 / 1689 loss=4.245, nll_loss=2.631, ppl=6.19, wps=460674, ups=1.06, wpb=432853, bsz=16147.7, num_updates=18700, lr=0.000462497, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=18286 epoch 012: 148 / 1689 loss=4.245, nll_loss=2.631, ppl=6.19, wps=460674, ups=1.06, wpb=432853, bsz=16147.7, num_updates=18700, lr=0.000462497, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=18286 epoch 012: 148 / 1689 loss=4.245, nll_loss=2.631, ppl=6.19, wps=460674, ups=1.06, wpb=432853, bsz=16147.7, num_updates=18700, lr=0.000462497, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=18286 epoch 012: 148 / 1689 loss=4.245, nll_loss=2.631, ppl=6.19, wps=460674, ups=1.06, wpb=432853, bsz=16147.7, num_updates=18700, lr=0.000462497, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=18286 epoch 012: 248 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459010, ups=1.06, wpb=433124, bsz=16837, num_updates=18800, lr=0.000461266, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=18380 epoch 012: 248 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459010, ups=1.06, wpb=433124, bsz=16837, num_updates=18800, lr=0.000461266, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=18380 epoch 012: 248 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459010, ups=1.06, wpb=433124, bsz=16837, num_updates=18800, lr=0.000461266, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=18380 epoch 012: 248 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459010, ups=1.06, wpb=433124, bsz=16837, num_updates=18800, lr=0.000461266, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=18380 epoch 012: 248 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459010, ups=1.06, wpb=433124, bsz=16837, num_updates=18800, lr=0.000461266, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=18380 epoch 012: 248 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459010, ups=1.06, wpb=433124, bsz=16837, num_updates=18800, lr=0.000461266, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=18380 epoch 012: 248 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459010, ups=1.06, wpb=433124, bsz=16837, num_updates=18800, lr=0.000461266, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=18380 epoch 012: 248 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459010, ups=1.06, wpb=433124, bsz=16837, num_updates=18800, lr=0.000461266, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=18380 epoch 012: 248 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459010, ups=1.06, wpb=433124, bsz=16837, num_updates=18800, lr=0.000461266, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=18380 epoch 012: 248 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459010, ups=1.06, wpb=433124, bsz=16837, num_updates=18800, lr=0.000461266, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=18380 epoch 012: 248 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459010, ups=1.06, wpb=433124, bsz=16837, num_updates=18800, lr=0.000461266, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=18380 epoch 012: 248 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459010, ups=1.06, wpb=433124, bsz=16837, num_updates=18800, lr=0.000461266, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=18380 epoch 012: 349 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=455738, ups=1.05, wpb=434091, bsz=16256.7, num_updates=18900, lr=0.000460044, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=18476 epoch 012: 349 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=455738, ups=1.05, wpb=434091, bsz=16256.7, num_updates=18900, lr=0.000460044, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=18476 epoch 012: 349 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=455738, ups=1.05, wpb=434091, bsz=16256.7, num_updates=18900, lr=0.000460044, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=18476 epoch 012: 349 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=455738, ups=1.05, wpb=434091, bsz=16256.7, num_updates=18900, lr=0.000460044, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=18476 epoch 012: 349 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=455738, ups=1.05, wpb=434091, bsz=16256.7, num_updates=18900, lr=0.000460044, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=18476 epoch 012: 349 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=455738, ups=1.05, wpb=434091, bsz=16256.7, num_updates=18900, lr=0.000460044, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=18476 epoch 012: 349 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=455738, ups=1.05, wpb=434091, bsz=16256.7, num_updates=18900, lr=0.000460044, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=18476 epoch 012: 349 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=455738, ups=1.05, wpb=434091, bsz=16256.7, num_updates=18900, lr=0.000460044, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=18476 epoch 012: 349 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=455738, ups=1.05, wpb=434091, bsz=16256.7, num_updates=18900, lr=0.000460044, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=18476 epoch 012: 349 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=455738, ups=1.05, wpb=434091, bsz=16256.7, num_updates=18900, lr=0.000460044, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=18476 epoch 012: 349 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=455738, ups=1.05, wpb=434091, bsz=16256.7, num_updates=18900, lr=0.000460044, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=18476 epoch 012: 349 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=455738, ups=1.05, wpb=434091, bsz=16256.7, num_updates=18900, lr=0.000460044, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=18476 epoch 012: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=456787, ups=1.05, wpb=433767, bsz=16303, num_updates=19000, lr=0.000458831, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18571 epoch 012: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=456787, ups=1.05, wpb=433767, bsz=16303, num_updates=19000, lr=0.000458831, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18571 epoch 012: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=456787, ups=1.05, wpb=433767, bsz=16303, num_updates=19000, lr=0.000458831, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18571 epoch 012: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=456787, ups=1.05, wpb=433767, bsz=16303, num_updates=19000, lr=0.000458831, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18571 epoch 012: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=456787, ups=1.05, wpb=433767, bsz=16303, num_updates=19000, lr=0.000458831, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18571 epoch 012: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=456787, ups=1.05, wpb=433767, bsz=16303, num_updates=19000, lr=0.000458831, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18571 epoch 012: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=456787, ups=1.05, wpb=433767, bsz=16303, num_updates=19000, lr=0.000458831, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18571 epoch 012: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=456787, ups=1.05, wpb=433767, bsz=16303, num_updates=19000, lr=0.000458831, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18571 epoch 012: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=456787, ups=1.05, wpb=433767, bsz=16303, num_updates=19000, lr=0.000458831, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18571 epoch 012: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=456787, ups=1.05, wpb=433767, bsz=16303, num_updates=19000, lr=0.000458831, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18571 epoch 012: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=456787, ups=1.05, wpb=433767, bsz=16303, num_updates=19000, lr=0.000458831, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18571 epoch 012: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=456787, ups=1.05, wpb=433767, bsz=16303, num_updates=19000, lr=0.000458831, gnorm=0.224, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18571 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 4.286 | nll_loss 2.647 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.286 | nll_loss 2.647 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.286 | nll_loss 2.647 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.286 | nll_loss 2.647 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.286 | nll_loss 2.647 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.286 | nll_loss 2.647 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.286 | nll_loss 2.647 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.286 | nll_loss 2.647 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.286 | nll_loss 2.647 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.286 | nll_loss 2.647 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.286 | nll_loss 2.647 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.286 | nll_loss 2.647 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.286 epoch 012: 549 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=300320, ups=0.7, wpb=431560, bsz=16636.5, num_updates=19100, lr=0.000457629, gnorm=0.225, clip=0, loss_scale=2, train_wall=119, gb_free=19.2, wall=18714 epoch 012: 549 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=300320, ups=0.7, wpb=431560, bsz=16636.5, num_updates=19100, lr=0.000457629, gnorm=0.225, clip=0, loss_scale=2, train_wall=119, gb_free=19.2, wall=18714 epoch 012: 549 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=300320, ups=0.7, wpb=431560, bsz=16636.5, num_updates=19100, lr=0.000457629, gnorm=0.225, clip=0, loss_scale=2, train_wall=119, gb_free=19.2, wall=18714 epoch 012: 549 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=300320, ups=0.7, wpb=431560, bsz=16636.5, num_updates=19100, lr=0.000457629, gnorm=0.225, clip=0, loss_scale=2, train_wall=119, gb_free=19.2, wall=18714 epoch 012: 549 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=300320, ups=0.7, wpb=431560, bsz=16636.5, num_updates=19100, lr=0.000457629, gnorm=0.225, clip=0, loss_scale=2, train_wall=119, gb_free=19.2, wall=18714 epoch 012: 549 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=300320, ups=0.7, wpb=431560, bsz=16636.5, num_updates=19100, lr=0.000457629, gnorm=0.225, clip=0, loss_scale=2, train_wall=119, gb_free=19.2, wall=18714 epoch 012: 549 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=300320, ups=0.7, wpb=431560, bsz=16636.5, num_updates=19100, lr=0.000457629, gnorm=0.225, clip=0, loss_scale=2, train_wall=119, gb_free=19.2, wall=18714 epoch 012: 549 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=300320, ups=0.7, wpb=431560, bsz=16636.5, num_updates=19100, lr=0.000457629, gnorm=0.225, clip=0, loss_scale=2, train_wall=119, gb_free=19.2, wall=18714 epoch 012: 549 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=300320, ups=0.7, wpb=431560, bsz=16636.5, num_updates=19100, lr=0.000457629, gnorm=0.225, clip=0, loss_scale=2, train_wall=119, gb_free=19.2, wall=18714 epoch 012: 549 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=300320, ups=0.7, wpb=431560, bsz=16636.5, num_updates=19100, lr=0.000457629, gnorm=0.225, clip=0, loss_scale=2, train_wall=119, gb_free=19.2, wall=18714 epoch 012: 549 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=300320, ups=0.7, wpb=431560, bsz=16636.5, num_updates=19100, lr=0.000457629, gnorm=0.225, clip=0, loss_scale=2, train_wall=119, gb_free=19.2, wall=18714 epoch 012: 549 / 1689 loss=4.257, nll_loss=2.645, ppl=6.26, wps=300320, ups=0.7, wpb=431560, bsz=16636.5, num_updates=19100, lr=0.000457629, gnorm=0.225, clip=0, loss_scale=2, train_wall=119, gb_free=19.2, wall=18714 epoch 012: 649 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=458820, ups=1.06, wpb=433829, bsz=16610.4, num_updates=19200, lr=0.000456435, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18809 epoch 012: 649 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=458820, ups=1.06, wpb=433829, bsz=16610.4, num_updates=19200, lr=0.000456435, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18809 epoch 012: 649 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=458820, ups=1.06, wpb=433829, bsz=16610.4, num_updates=19200, lr=0.000456435, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18809 epoch 012: 649 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=458820, ups=1.06, wpb=433829, bsz=16610.4, num_updates=19200, lr=0.000456435, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18809 epoch 012: 649 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=458820, ups=1.06, wpb=433829, bsz=16610.4, num_updates=19200, lr=0.000456435, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18809 epoch 012: 649 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=458820, ups=1.06, wpb=433829, bsz=16610.4, num_updates=19200, lr=0.000456435, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18809 epoch 012: 649 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=458820, ups=1.06, wpb=433829, bsz=16610.4, num_updates=19200, lr=0.000456435, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18809 epoch 012: 649 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=458820, ups=1.06, wpb=433829, bsz=16610.4, num_updates=19200, lr=0.000456435, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18809 epoch 012: 649 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=458820, ups=1.06, wpb=433829, bsz=16610.4, num_updates=19200, lr=0.000456435, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18809 epoch 012: 649 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=458820, ups=1.06, wpb=433829, bsz=16610.4, num_updates=19200, lr=0.000456435, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18809 epoch 012: 649 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=458820, ups=1.06, wpb=433829, bsz=16610.4, num_updates=19200, lr=0.000456435, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18809 epoch 012: 649 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=458820, ups=1.06, wpb=433829, bsz=16610.4, num_updates=19200, lr=0.000456435, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18809 epoch 012: 749 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=454603, ups=1.05, wpb=433140, bsz=16639.4, num_updates=19300, lr=0.000455251, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=18904 epoch 012: 749 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=454603, ups=1.05, wpb=433140, bsz=16639.4, num_updates=19300, lr=0.000455251, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=18904 epoch 012: 749 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=454603, ups=1.05, wpb=433140, bsz=16639.4, num_updates=19300, lr=0.000455251, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=18904 epoch 012: 749 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=454603, ups=1.05, wpb=433140, bsz=16639.4, num_updates=19300, lr=0.000455251, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=18904 epoch 012: 749 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=454603, ups=1.05, wpb=433140, bsz=16639.4, num_updates=19300, lr=0.000455251, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=18904 epoch 012: 749 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=454603, ups=1.05, wpb=433140, bsz=16639.4, num_updates=19300, lr=0.000455251, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=18904 epoch 012: 749 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=454603, ups=1.05, wpb=433140, bsz=16639.4, num_updates=19300, lr=0.000455251, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=18904 epoch 012: 749 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=454603, ups=1.05, wpb=433140, bsz=16639.4, num_updates=19300, lr=0.000455251, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=18904 epoch 012: 749 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=454603, ups=1.05, wpb=433140, bsz=16639.4, num_updates=19300, lr=0.000455251, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=18904 epoch 012: 749 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=454603, ups=1.05, wpb=433140, bsz=16639.4, num_updates=19300, lr=0.000455251, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=18904 epoch 012: 749 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=454603, ups=1.05, wpb=433140, bsz=16639.4, num_updates=19300, lr=0.000455251, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=18904 epoch 012: 749 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=454603, ups=1.05, wpb=433140, bsz=16639.4, num_updates=19300, lr=0.000455251, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=18904 epoch 012: 849 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=454581, ups=1.05, wpb=433424, bsz=16803.6, num_updates=19400, lr=0.000454077, gnorm=0.235, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=19000 epoch 012: 849 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=454581, ups=1.05, wpb=433424, bsz=16803.6, num_updates=19400, lr=0.000454077, gnorm=0.235, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=19000 epoch 012: 849 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=454581, ups=1.05, wpb=433424, bsz=16803.6, num_updates=19400, lr=0.000454077, gnorm=0.235, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=19000 epoch 012: 849 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=454581, ups=1.05, wpb=433424, bsz=16803.6, num_updates=19400, lr=0.000454077, gnorm=0.235, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=19000 epoch 012: 849 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=454581, ups=1.05, wpb=433424, bsz=16803.6, num_updates=19400, lr=0.000454077, gnorm=0.235, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=19000 epoch 012: 849 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=454581, ups=1.05, wpb=433424, bsz=16803.6, num_updates=19400, lr=0.000454077, gnorm=0.235, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=19000 epoch 012: 849 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=454581, ups=1.05, wpb=433424, bsz=16803.6, num_updates=19400, lr=0.000454077, gnorm=0.235, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=19000 epoch 012: 849 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=454581, ups=1.05, wpb=433424, bsz=16803.6, num_updates=19400, lr=0.000454077, gnorm=0.235, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=19000 epoch 012: 849 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=454581, ups=1.05, wpb=433424, bsz=16803.6, num_updates=19400, lr=0.000454077, gnorm=0.235, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=19000 epoch 012: 849 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=454581, ups=1.05, wpb=433424, bsz=16803.6, num_updates=19400, lr=0.000454077, gnorm=0.235, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=19000 epoch 012: 849 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=454581, ups=1.05, wpb=433424, bsz=16803.6, num_updates=19400, lr=0.000454077, gnorm=0.235, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=19000 epoch 012: 849 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=454581, ups=1.05, wpb=433424, bsz=16803.6, num_updates=19400, lr=0.000454077, gnorm=0.235, clip=0, loss_scale=4, train_wall=94, gb_free=20.5, wall=19000 epoch 012: 950 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=453764, ups=1.04, wpb=437661, bsz=16651.2, num_updates=19500, lr=0.000452911, gnorm=0.221, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=19096 epoch 012: 950 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=453764, ups=1.04, wpb=437661, bsz=16651.2, num_updates=19500, lr=0.000452911, gnorm=0.221, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=19096 epoch 012: 950 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=453764, ups=1.04, wpb=437661, bsz=16651.2, num_updates=19500, lr=0.000452911, gnorm=0.221, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=19096 epoch 012: 950 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=453764, ups=1.04, wpb=437661, bsz=16651.2, num_updates=19500, lr=0.000452911, gnorm=0.221, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=19096 epoch 012: 950 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=453764, ups=1.04, wpb=437661, bsz=16651.2, num_updates=19500, lr=0.000452911, gnorm=0.221, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=19096 epoch 012: 950 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=453764, ups=1.04, wpb=437661, bsz=16651.2, num_updates=19500, lr=0.000452911, gnorm=0.221, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=19096 epoch 012: 950 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=453764, ups=1.04, wpb=437661, bsz=16651.2, num_updates=19500, lr=0.000452911, gnorm=0.221, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=19096 epoch 012: 950 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=453764, ups=1.04, wpb=437661, bsz=16651.2, num_updates=19500, lr=0.000452911, gnorm=0.221, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=19096 epoch 012: 950 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=453764, ups=1.04, wpb=437661, bsz=16651.2, num_updates=19500, lr=0.000452911, gnorm=0.221, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=19096 epoch 012: 950 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=453764, ups=1.04, wpb=437661, bsz=16651.2, num_updates=19500, lr=0.000452911, gnorm=0.221, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=19096 epoch 012: 950 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=453764, ups=1.04, wpb=437661, bsz=16651.2, num_updates=19500, lr=0.000452911, gnorm=0.221, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=19096 epoch 012: 950 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=453764, ups=1.04, wpb=437661, bsz=16651.2, num_updates=19500, lr=0.000452911, gnorm=0.221, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=19096 epoch 012: 1050 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=456117, ups=1.05, wpb=433369, bsz=16096.4, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=19191 epoch 012: 1050 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=456117, ups=1.05, wpb=433369, bsz=16096.4, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=19191 epoch 012: 1050 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=456117, ups=1.05, wpb=433369, bsz=16096.4, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=19191 epoch 012: 1050 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=456117, ups=1.05, wpb=433369, bsz=16096.4, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=19191 epoch 012: 1050 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=456117, ups=1.05, wpb=433369, bsz=16096.4, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=19191 epoch 012: 1050 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=456117, ups=1.05, wpb=433369, bsz=16096.4, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=19191 epoch 012: 1050 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=456117, ups=1.05, wpb=433369, bsz=16096.4, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=19191 epoch 012: 1050 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=456117, ups=1.05, wpb=433369, bsz=16096.4, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=19191 epoch 012: 1050 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=456117, ups=1.05, wpb=433369, bsz=16096.4, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=19191 epoch 012: 1050 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=456117, ups=1.05, wpb=433369, bsz=16096.4, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=19191 epoch 012: 1050 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=456117, ups=1.05, wpb=433369, bsz=16096.4, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=19191 epoch 012: 1050 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=456117, ups=1.05, wpb=433369, bsz=16096.4, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=19191 epoch 012: 1150 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=457152, ups=1.05, wpb=433748, bsz=16579, num_updates=19700, lr=0.000450606, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=19286 epoch 012: 1150 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=457152, ups=1.05, wpb=433748, bsz=16579, num_updates=19700, lr=0.000450606, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=19286 epoch 012: 1150 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=457152, ups=1.05, wpb=433748, bsz=16579, num_updates=19700, lr=0.000450606, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=19286 epoch 012: 1150 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=457152, ups=1.05, wpb=433748, bsz=16579, num_updates=19700, lr=0.000450606, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=19286 epoch 012: 1150 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=457152, ups=1.05, wpb=433748, bsz=16579, num_updates=19700, lr=0.000450606, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=19286 epoch 012: 1150 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=457152, ups=1.05, wpb=433748, bsz=16579, num_updates=19700, lr=0.000450606, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=19286 epoch 012: 1150 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=457152, ups=1.05, wpb=433748, bsz=16579, num_updates=19700, lr=0.000450606, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=19286 epoch 012: 1150 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=457152, ups=1.05, wpb=433748, bsz=16579, num_updates=19700, lr=0.000450606, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=19286 epoch 012: 1150 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=457152, ups=1.05, wpb=433748, bsz=16579, num_updates=19700, lr=0.000450606, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=19286 epoch 012: 1150 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=457152, ups=1.05, wpb=433748, bsz=16579, num_updates=19700, lr=0.000450606, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=19286 epoch 012: 1150 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=457152, ups=1.05, wpb=433748, bsz=16579, num_updates=19700, lr=0.000450606, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=19286 epoch 012: 1150 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=457152, ups=1.05, wpb=433748, bsz=16579, num_updates=19700, lr=0.000450606, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=19286 epoch 012: 1250 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=457528, ups=1.05, wpb=434280, bsz=16588, num_updates=19800, lr=0.000449467, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=19381 epoch 012: 1250 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=457528, ups=1.05, wpb=434280, bsz=16588, num_updates=19800, lr=0.000449467, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=19381 epoch 012: 1250 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=457528, ups=1.05, wpb=434280, bsz=16588, num_updates=19800, lr=0.000449467, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=19381 epoch 012: 1250 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=457528, ups=1.05, wpb=434280, bsz=16588, num_updates=19800, lr=0.000449467, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=19381 epoch 012: 1250 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=457528, ups=1.05, wpb=434280, bsz=16588, num_updates=19800, lr=0.000449467, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=19381 epoch 012: 1250 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=457528, ups=1.05, wpb=434280, bsz=16588, num_updates=19800, lr=0.000449467, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=19381 epoch 012: 1250 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=457528, ups=1.05, wpb=434280, bsz=16588, num_updates=19800, lr=0.000449467, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=19381 epoch 012: 1250 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=457528, ups=1.05, wpb=434280, bsz=16588, num_updates=19800, lr=0.000449467, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=19381 epoch 012: 1250 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=457528, ups=1.05, wpb=434280, bsz=16588, num_updates=19800, lr=0.000449467, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=19381 epoch 012: 1250 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=457528, ups=1.05, wpb=434280, bsz=16588, num_updates=19800, lr=0.000449467, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=19381 epoch 012: 1250 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=457528, ups=1.05, wpb=434280, bsz=16588, num_updates=19800, lr=0.000449467, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=19381 epoch 012: 1250 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=457528, ups=1.05, wpb=434280, bsz=16588, num_updates=19800, lr=0.000449467, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=19381 epoch 012: 1350 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=461446, ups=1.06, wpb=435210, bsz=16933.2, num_updates=19900, lr=0.000448336, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=19475 epoch 012: 1350 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=461446, ups=1.06, wpb=435210, bsz=16933.2, num_updates=19900, lr=0.000448336, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=19475 epoch 012: 1350 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=461446, ups=1.06, wpb=435210, bsz=16933.2, num_updates=19900, lr=0.000448336, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=19475 epoch 012: 1350 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=461446, ups=1.06, wpb=435210, bsz=16933.2, num_updates=19900, lr=0.000448336, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=19475 epoch 012: 1350 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=461446, ups=1.06, wpb=435210, bsz=16933.2, num_updates=19900, lr=0.000448336, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=19475 epoch 012: 1350 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=461446, ups=1.06, wpb=435210, bsz=16933.2, num_updates=19900, lr=0.000448336, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=19475 epoch 012: 1350 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=461446, ups=1.06, wpb=435210, bsz=16933.2, num_updates=19900, lr=0.000448336, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=19475 epoch 012: 1350 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=461446, ups=1.06, wpb=435210, bsz=16933.2, num_updates=19900, lr=0.000448336, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=19475 epoch 012: 1350 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=461446, ups=1.06, wpb=435210, bsz=16933.2, num_updates=19900, lr=0.000448336, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=19475 epoch 012: 1350 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=461446, ups=1.06, wpb=435210, bsz=16933.2, num_updates=19900, lr=0.000448336, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=19475 epoch 012: 1350 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=461446, ups=1.06, wpb=435210, bsz=16933.2, num_updates=19900, lr=0.000448336, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=19475 epoch 012: 1350 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=461446, ups=1.06, wpb=435210, bsz=16933.2, num_updates=19900, lr=0.000448336, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=19475 epoch 012: 1450 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=461431, ups=1.06, wpb=434987, bsz=16310.7, num_updates=20000, lr=0.000447214, gnorm=0.226, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=19569 epoch 012: 1450 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=461431, ups=1.06, wpb=434987, bsz=16310.7, num_updates=20000, lr=0.000447214, gnorm=0.226, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=19569 epoch 012: 1450 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=461431, ups=1.06, wpb=434987, bsz=16310.7, num_updates=20000, lr=0.000447214, gnorm=0.226, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=19569 epoch 012: 1450 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=461431, ups=1.06, wpb=434987, bsz=16310.7, num_updates=20000, lr=0.000447214, gnorm=0.226, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=19569 epoch 012: 1450 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=461431, ups=1.06, wpb=434987, bsz=16310.7, num_updates=20000, lr=0.000447214, gnorm=0.226, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=19569 epoch 012: 1450 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=461431, ups=1.06, wpb=434987, bsz=16310.7, num_updates=20000, lr=0.000447214, gnorm=0.226, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=19569 epoch 012: 1450 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=461431, ups=1.06, wpb=434987, bsz=16310.7, num_updates=20000, lr=0.000447214, gnorm=0.226, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=19569 epoch 012: 1450 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=461431, ups=1.06, wpb=434987, bsz=16310.7, num_updates=20000, lr=0.000447214, gnorm=0.226, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=19569 epoch 012: 1450 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=461431, ups=1.06, wpb=434987, bsz=16310.7, num_updates=20000, lr=0.000447214, gnorm=0.226, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=19569 epoch 012: 1450 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=461431, ups=1.06, wpb=434987, bsz=16310.7, num_updates=20000, lr=0.000447214, gnorm=0.226, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=19569 epoch 012: 1450 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=461431, ups=1.06, wpb=434987, bsz=16310.7, num_updates=20000, lr=0.000447214, gnorm=0.226, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=19569 epoch 012: 1450 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=461431, ups=1.06, wpb=434987, bsz=16310.7, num_updates=20000, lr=0.000447214, gnorm=0.226, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=19569 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.652 | ppl 6.29 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.652 | ppl 6.29 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.652 | ppl 6.29 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.652 | ppl 6.29 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.652 | ppl 6.29 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.652 | ppl 6.29 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.652 | ppl 6.29 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.652 | ppl 6.29 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.652 | ppl 6.29 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.652 | ppl 6.29 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.652 | ppl 6.29 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.286 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.652 | ppl 6.29 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.286 epoch 012: 1550 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=378116, ups=0.87, wpb=434188, bsz=16408.8, num_updates=20100, lr=0.0004461, gnorm=0.213, clip=0, loss_scale=4, train_wall=99, gb_free=18.8, wall=19684 epoch 012: 1550 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=378116, ups=0.87, wpb=434188, bsz=16408.8, num_updates=20100, lr=0.0004461, gnorm=0.213, clip=0, loss_scale=4, train_wall=99, gb_free=18.8, wall=19684 epoch 012: 1550 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=378116, ups=0.87, wpb=434188, bsz=16408.8, num_updates=20100, lr=0.0004461, gnorm=0.213, clip=0, loss_scale=4, train_wall=99, gb_free=18.8, wall=19684 epoch 012: 1550 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=378116, ups=0.87, wpb=434188, bsz=16408.8, num_updates=20100, lr=0.0004461, gnorm=0.213, clip=0, loss_scale=4, train_wall=99, gb_free=18.8, wall=19684 epoch 012: 1550 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=378116, ups=0.87, wpb=434188, bsz=16408.8, num_updates=20100, lr=0.0004461, gnorm=0.213, clip=0, loss_scale=4, train_wall=99, gb_free=18.8, wall=19684 epoch 012: 1550 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=378116, ups=0.87, wpb=434188, bsz=16408.8, num_updates=20100, lr=0.0004461, gnorm=0.213, clip=0, loss_scale=4, train_wall=99, gb_free=18.8, wall=19684 epoch 012: 1550 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=378116, ups=0.87, wpb=434188, bsz=16408.8, num_updates=20100, lr=0.0004461, gnorm=0.213, clip=0, loss_scale=4, train_wall=99, gb_free=18.8, wall=19684 epoch 012: 1550 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=378116, ups=0.87, wpb=434188, bsz=16408.8, num_updates=20100, lr=0.0004461, gnorm=0.213, clip=0, loss_scale=4, train_wall=99, gb_free=18.8, wall=19684 epoch 012: 1550 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=378116, ups=0.87, wpb=434188, bsz=16408.8, num_updates=20100, lr=0.0004461, gnorm=0.213, clip=0, loss_scale=4, train_wall=99, gb_free=18.8, wall=19684 epoch 012: 1550 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=378116, ups=0.87, wpb=434188, bsz=16408.8, num_updates=20100, lr=0.0004461, gnorm=0.213, clip=0, loss_scale=4, train_wall=99, gb_free=18.8, wall=19684 epoch 012: 1550 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=378116, ups=0.87, wpb=434188, bsz=16408.8, num_updates=20100, lr=0.0004461, gnorm=0.213, clip=0, loss_scale=4, train_wall=99, gb_free=18.8, wall=19684 epoch 012: 1550 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=378116, ups=0.87, wpb=434188, bsz=16408.8, num_updates=20100, lr=0.0004461, gnorm=0.213, clip=0, loss_scale=4, train_wall=99, gb_free=18.8, wall=19684 epoch 012: 1651 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=451479, ups=1.04, wpb=432817, bsz=16658.1, num_updates=20200, lr=0.000444994, gnorm=0.215, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=19780 epoch 012: 1651 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=451479, ups=1.04, wpb=432817, bsz=16658.1, num_updates=20200, lr=0.000444994, gnorm=0.215, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=19780 epoch 012: 1651 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=451479, ups=1.04, wpb=432817, bsz=16658.1, num_updates=20200, lr=0.000444994, gnorm=0.215, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=19780 epoch 012: 1651 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=451479, ups=1.04, wpb=432817, bsz=16658.1, num_updates=20200, lr=0.000444994, gnorm=0.215, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=19780 epoch 012: 1651 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=451479, ups=1.04, wpb=432817, bsz=16658.1, num_updates=20200, lr=0.000444994, gnorm=0.215, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=19780 epoch 012: 1651 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=451479, ups=1.04, wpb=432817, bsz=16658.1, num_updates=20200, lr=0.000444994, gnorm=0.215, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=19780 epoch 012: 1651 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=451479, ups=1.04, wpb=432817, bsz=16658.1, num_updates=20200, lr=0.000444994, gnorm=0.215, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=19780 epoch 012: 1651 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=451479, ups=1.04, wpb=432817, bsz=16658.1, num_updates=20200, lr=0.000444994, gnorm=0.215, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=19780 epoch 012: 1651 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=451479, ups=1.04, wpb=432817, bsz=16658.1, num_updates=20200, lr=0.000444994, gnorm=0.215, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=19780 epoch 012: 1651 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=451479, ups=1.04, wpb=432817, bsz=16658.1, num_updates=20200, lr=0.000444994, gnorm=0.215, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=19780 epoch 012: 1651 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=451479, ups=1.04, wpb=432817, bsz=16658.1, num_updates=20200, lr=0.000444994, gnorm=0.215, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=19780 epoch 012: 1651 / 1689 loss=4.268, nll_loss=2.659, ppl=6.31, wps=451479, ups=1.04, wpb=432817, bsz=16658.1, num_updates=20200, lr=0.000444994, gnorm=0.215, clip=0, loss_scale=2, train_wall=95, gb_free=18.2, wall=19780 end of epoch 12 (average epoch stats below) epoch 012 | loss 4.26 | nll_loss 2.649 | ppl 6.27 | wps 437793 | ups 1.01 | wpb 433534 | bsz 16507.2 | num_updates 20238 | lr 0.000444576 | gnorm 0.222 | clip 0 | loss_scale 2 | train_wall 1606 | gb_free 20.6 | wall 19815 epoch 012 | loss 4.26 | nll_loss 2.649 | ppl 6.27 | wps 437793 | ups 1.01 | wpb 433534 | bsz 16507.2 | num_updates 20238 | lr 0.000444576 | gnorm 0.222 | clip 0 | loss_scale 2 | train_wall 1606 | gb_free 20.6 | wall 19815 epoch 012 | loss 4.26 | nll_loss 2.649 | ppl 6.27 | wps 437793 | ups 1.01 | wpb 433534 | bsz 16507.2 | num_updates 20238 | lr 0.000444576 | gnorm 0.222 | clip 0 | loss_scale 2 | train_wall 1606 | gb_free 20.6 | wall 19815 epoch 012 | loss 4.26 | nll_loss 2.649 | ppl 6.27 | wps 437793 | ups 1.01 | wpb 433534 | bsz 16507.2 | num_updates 20238 | lr 0.000444576 | gnorm 0.222 | clip 0 | loss_scale 2 | train_wall 1606 | gb_free 20.6 | wall 19815 epoch 012 | loss 4.26 | nll_loss 2.649 | ppl 6.27 | wps 437793 | ups 1.01 | wpb 433534 | bsz 16507.2 | num_updates 20238 | lr 0.000444576 | gnorm 0.222 | clip 0 | loss_scale 2 | train_wall 1606 | gb_free 20.6 | wall 19815 epoch 012 | loss 4.26 | nll_loss 2.649 | ppl 6.27 | wps 437793 | ups 1.01 | wpb 433534 | bsz 16507.2 | num_updates 20238 | lr 0.000444576 | gnorm 0.222 | clip 0 | loss_scale 2 | train_wall 1606 | gb_free 20.6 | wall 19815 epoch 012 | loss 4.26 | nll_loss 2.649 | ppl 6.27 | wps 437793 | ups 1.01 | wpb 433534 | bsz 16507.2 | num_updates 20238 | lr 0.000444576 | gnorm 0.222 | clip 0 | loss_scale 2 | train_wall 1606 | gb_free 20.6 | wall 19815 epoch 012 | loss 4.26 | nll_loss 2.649 | ppl 6.27 | wps 437793 | ups 1.01 | wpb 433534 | bsz 16507.2 | num_updates 20238 | lr 0.000444576 | gnorm 0.222 | clip 0 | loss_scale 2 | train_wall 1606 | gb_free 20.6 | wall 19815 epoch 012 | loss 4.26 | nll_loss 2.649 | ppl 6.27 | wps 437793 | ups 1.01 | wpb 433534 | bsz 16507.2 | num_updates 20238 | lr 0.000444576 | gnorm 0.222 | clip 0 | loss_scale 2 | train_wall 1606 | gb_free 20.6 | wall 19815 epoch 012 | loss 4.26 | nll_loss 2.649 | ppl 6.27 | wps 437793 | ups 1.01 | wpb 433534 | bsz 16507.2 | num_updates 20238 | lr 0.000444576 | gnorm 0.222 | clip 0 | loss_scale 2 | train_wall 1606 | gb_free 20.6 | wall 19815 epoch 012 | loss 4.26 | nll_loss 2.649 | ppl 6.27 | wps 437793 | ups 1.01 | wpb 433534 | bsz 16507.2 | num_updates 20238 | lr 0.000444576 | gnorm 0.222 | clip 0 | loss_scale 2 | train_wall 1606 | gb_free 20.6 | wall 19815 epoch 012 | loss 4.26 | nll_loss 2.649 | ppl 6.27 | wps 437793 | ups 1.01 | wpb 433534 | bsz 16507.2 | num_updates 20238 | lr 0.000444576 | gnorm 0.222 | clip 0 | loss_scale 2 | train_wall 1606 | gb_free 20.6 | wall 19815 Start iterating over samples epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 62 / 1689 loss=4.238, nll_loss=2.624, ppl=6.17, wps=457906, ups=1.07, wpb=429420, bsz=16086.7, num_updates=20300, lr=0.000443897, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=19874 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 162 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=461907, ups=1.06, wpb=434348, bsz=16450, num_updates=20400, lr=0.000442807, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=19968 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 262 / 1689 loss=4.238, nll_loss=2.624, ppl=6.16, wps=460426, ups=1.06, wpb=435121, bsz=16381.8, num_updates=20500, lr=0.000441726, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=20062 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 362 / 1689 loss=4.23, nll_loss=2.616, ppl=6.13, wps=459259, ups=1.06, wpb=433348, bsz=16768.9, num_updates=20600, lr=0.000440653, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=20157 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 462 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=456586, ups=1.06, wpb=432368, bsz=16570, num_updates=20700, lr=0.000439587, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=20251 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 562 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=459406, ups=1.06, wpb=433033, bsz=16171.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=4, train_wall=93, gb_free=16.5, wall=20346 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 662 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459096, ups=1.05, wpb=435329, bsz=17005.9, num_updates=20900, lr=0.000437479, gnorm=0.223, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=20441 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 epoch 013: 763 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456426, ups=1.05, wpb=433183, bsz=16521, num_updates=21000, lr=0.000436436, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=20535 begin validation on "valid" subset epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013 | valid on 'valid' subset | loss 4.283 | nll_loss 2.646 | ppl 6.26 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.283 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 863 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=381628, ups=0.88, wpb=435653, bsz=16672.4, num_updates=21100, lr=0.0004354, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=20650 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 963 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=456862, ups=1.06, wpb=432131, bsz=16423.2, num_updates=21200, lr=0.000434372, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=20744 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1063 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=455465, ups=1.05, wpb=434918, bsz=16444.2, num_updates=21300, lr=0.000433351, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=20840 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1163 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=459467, ups=1.06, wpb=433792, bsz=16407.2, num_updates=21400, lr=0.000432338, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=20934 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1263 / 1689 loss=4.253, nll_loss=2.642, ppl=6.24, wps=458215, ups=1.06, wpb=433941, bsz=16348.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=4, train_wall=93, gb_free=18, wall=21029 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1363 / 1689 loss=4.257, nll_loss=2.648, ppl=6.27, wps=454771, ups=1.05, wpb=433473, bsz=16801.4, num_updates=21600, lr=0.000430331, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=21124 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1463 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=457612, ups=1.06, wpb=432907, bsz=16547.9, num_updates=21700, lr=0.000429339, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=21219 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1563 / 1689 loss=4.254, nll_loss=2.644, ppl=6.25, wps=454750, ups=1.05, wpb=432643, bsz=16647.3, num_updates=21800, lr=0.000428353, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=21314 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 epoch 013: 1663 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=456050, ups=1.05, wpb=433370, bsz=16254.4, num_updates=21900, lr=0.000427374, gnorm=0.226, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=21409 end of epoch 13 (average epoch stats below) epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 epoch 013 | loss 4.245 | nll_loss 2.632 | ppl 6.2 | wps 452296 | ups 1.04 | wpb 433532 | bsz 16506.3 | num_updates 21926 | lr 0.00042712 | gnorm 0.22 | clip 0 | loss_scale 4 | train_wall 1569 | gb_free 19.7 | wall 21433 Start iterating over samples epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 epoch 014: 74 / 1689 loss=4.223, nll_loss=2.607, ppl=6.09, wps=449723, ups=1.05, wpb=428567, bsz=16373, num_updates=22000, lr=0.000426401, gnorm=0.222, clip=0, loss_scale=8, train_wall=93, gb_free=21.3, wall=21504 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014 | valid on 'valid' subset | loss 4.276 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.276 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 175 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=378950, ups=0.87, wpb=434977, bsz=16544.7, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21619 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 275 / 1689 loss=4.218, nll_loss=2.602, ppl=6.07, wps=464410, ups=1.07, wpb=432830, bsz=16406.5, num_updates=22200, lr=0.000424476, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=21712 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 375 / 1689 loss=4.225, nll_loss=2.61, ppl=6.1, wps=462279, ups=1.07, wpb=432559, bsz=16750.9, num_updates=22300, lr=0.000423524, gnorm=0.222, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21806 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 475 / 1689 loss=4.224, nll_loss=2.609, ppl=6.1, wps=464810, ups=1.06, wpb=436903, bsz=16370, num_updates=22400, lr=0.000422577, gnorm=0.224, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=21900 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 576 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=456410, ups=1.05, wpb=434663, bsz=16538.6, num_updates=22500, lr=0.000421637, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=21995 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 676 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=458347, ups=1.06, wpb=432390, bsz=16334.8, num_updates=22600, lr=0.000420703, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=22089 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 776 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458918, ups=1.05, wpb=435615, bsz=16418.1, num_updates=22700, lr=0.000419775, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=22184 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 876 / 1689 loss=4.235, nll_loss=2.622, ppl=6.15, wps=460583, ups=1.06, wpb=435044, bsz=16613.4, num_updates=22800, lr=0.000418854, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22279 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 976 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=451738, ups=1.05, wpb=429382, bsz=16842.6, num_updates=22900, lr=0.000417938, gnorm=0.213, clip=0, loss_scale=2, train_wall=94, gb_free=17.9, wall=22374 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 epoch 014: 1077 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=451814, ups=1.04, wpb=435576, bsz=16595, num_updates=23000, lr=0.000417029, gnorm=0.207, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=22470 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014 | valid on 'valid' subset | loss 4.267 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.267 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1177 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=377981, ups=0.88, wpb=431376, bsz=16464.5, num_updates=23100, lr=0.000416125, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=22584 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1278 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=455884, ups=1.05, wpb=434322, bsz=16423.3, num_updates=23200, lr=0.000415227, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=22680 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1378 / 1689 loss=4.249, nll_loss=2.638, ppl=6.22, wps=460760, ups=1.06, wpb=434974, bsz=16327.5, num_updates=23300, lr=0.000414335, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=22774 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1478 / 1689 loss=4.241, nll_loss=2.63, ppl=6.19, wps=457176, ups=1.05, wpb=434833, bsz=16593.2, num_updates=23400, lr=0.000413449, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=22869 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1579 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=452693, ups=1.04, wpb=433522, bsz=16502.2, num_updates=23500, lr=0.000412568, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=22965 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 epoch 014: 1679 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=457626, ups=1.06, wpb=432255, bsz=16420.2, num_updates=23600, lr=0.000411693, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23059 end of epoch 14 (average epoch stats below) epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 epoch 014 | loss 4.231 | nll_loss 2.617 | ppl 6.14 | wps 446460 | ups 1.03 | wpb 433511 | bsz 16496.7 | num_updates 23610 | lr 0.000411606 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 21.6 | wall 23068 Start iterating over samples epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 90 / 1689 loss=4.193, nll_loss=2.574, ppl=5.96, wps=451801, ups=1.05, wpb=430722, bsz=16165, num_updates=23700, lr=0.000410824, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=23155 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 190 / 1689 loss=4.201, nll_loss=2.584, ppl=5.99, wps=455186, ups=1.05, wpb=434427, bsz=16532.2, num_updates=23800, lr=0.00040996, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.5, wall=23250 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 290 / 1689 loss=4.212, nll_loss=2.595, ppl=6.04, wps=455277, ups=1.05, wpb=433989, bsz=16520.1, num_updates=23900, lr=0.000409101, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.6, wall=23345 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 epoch 015: 390 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=457104, ups=1.05, wpb=434562, bsz=16841.8, num_updates=24000, lr=0.000408248, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=23441 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015 | valid on 'valid' subset | loss 4.272 | nll_loss 2.638 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.267 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 490 / 1689 loss=4.218, nll_loss=2.603, ppl=6.08, wps=408117, ups=0.94, wpb=435482, bsz=16724.5, num_updates=24100, lr=0.0004074, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=23547 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 590 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=458615, ups=1.06, wpb=431449, bsz=16402.7, num_updates=24200, lr=0.000406558, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=23641 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 690 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457933, ups=1.05, wpb=434274, bsz=16732.1, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=23736 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 790 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=460138, ups=1.07, wpb=430993, bsz=16237.9, num_updates=24400, lr=0.000404888, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=23830 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 890 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=459590, ups=1.06, wpb=435509, bsz=16257.5, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=23925 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 990 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=461928, ups=1.07, wpb=432478, bsz=16219.4, num_updates=24600, lr=0.000403239, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=21.3, wall=24018 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1090 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459222, ups=1.06, wpb=433268, bsz=16825.7, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=24113 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1190 / 1689 loss=4.23, nll_loss=2.617, ppl=6.13, wps=462905, ups=1.07, wpb=434483, bsz=16295.5, num_updates=24800, lr=0.00040161, gnorm=0.21, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=24206 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1290 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=453058, ups=1.05, wpb=431412, bsz=16796.7, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=24302 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 epoch 015: 1390 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=456266, ups=1.05, wpb=433499, bsz=16639.9, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=24397 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015 | valid on 'valid' subset | loss 4.259 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.259 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1491 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=291648, ups=0.67, wpb=434206, bsz=16704.2, num_updates=25100, lr=0.000399202, gnorm=0.21, clip=0, loss_scale=2, train_wall=122, gb_free=19.8, wall=24546 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 epoch 015: 1591 / 1689 loss=4.227, nll_loss=2.614, ppl=6.12, wps=458520, ups=1.05, wpb=435525, bsz=16515, num_updates=25200, lr=0.00039841, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=24641 end of epoch 15 (average epoch stats below) epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 epoch 015 | loss 4.218 | nll_loss 2.604 | ppl 6.08 | wps 439711 | ups 1.01 | wpb 433538 | bsz 16505.5 | num_updates 25298 | lr 0.000397637 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1603 | gb_free 19.4 | wall 24732 Start iterating over samples epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 2 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456645, ups=1.06, wpb=430481, bsz=16016.4, num_updates=25300, lr=0.000397621, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=24735 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 102 / 1689 loss=4.191, nll_loss=2.572, ppl=5.95, wps=457590, ups=1.06, wpb=431280, bsz=16469.1, num_updates=25400, lr=0.000396838, gnorm=0.206, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=24829 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 202 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=455830, ups=1.05, wpb=434175, bsz=16728.7, num_updates=25500, lr=0.000396059, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24924 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 302 / 1689 loss=4.197, nll_loss=2.579, ppl=5.98, wps=457505, ups=1.05, wpb=435672, bsz=16132.8, num_updates=25600, lr=0.000395285, gnorm=0.203, clip=0, loss_scale=4, train_wall=94, gb_free=20.4, wall=25020 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 402 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=457284, ups=1.05, wpb=435652, bsz=16225.6, num_updates=25700, lr=0.000394515, gnorm=0.221, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=25115 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 504 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=450547, ups=1.04, wpb=433330, bsz=16150.1, num_updates=25800, lr=0.00039375, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=20, wall=25211 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 605 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=451677, ups=1.04, wpb=433738, bsz=16467.1, num_updates=25900, lr=0.000392989, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=95, gb_free=19, wall=25307 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 epoch 016: 705 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=459942, ups=1.06, wpb=433875, bsz=16566.6, num_updates=26000, lr=0.000392232, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=25401 begin validation on "valid" subset epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016 | valid on 'valid' subset | loss 4.261 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.259 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 805 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=409025, ups=0.95, wpb=432204, bsz=16582, num_updates=26100, lr=0.00039148, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=25507 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 905 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462204, ups=1.06, wpb=434854, bsz=16724.9, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=25601 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1005 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=460600, ups=1.06, wpb=433474, bsz=16329.5, num_updates=26300, lr=0.000389989, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=25695 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1105 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=459404, ups=1.06, wpb=433713, bsz=16550.6, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=25790 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1205 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=452979, ups=1.04, wpb=434008, bsz=17063.8, num_updates=26500, lr=0.000388514, gnorm=0.209, clip=0, loss_scale=1, train_wall=95, gb_free=19.2, wall=25885 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1305 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=452184, ups=1.04, wpb=433199, bsz=16673.5, num_updates=26600, lr=0.000387783, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=20.1, wall=25981 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1405 / 1689 loss=4.202, nll_loss=2.586, ppl=6.01, wps=458516, ups=1.06, wpb=433457, bsz=16571.5, num_updates=26700, lr=0.000387056, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26076 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1505 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=456280, ups=1.06, wpb=432310, bsz=16514.8, num_updates=26800, lr=0.000386334, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=26171 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 epoch 016: 1605 / 1689 loss=4.219, nll_loss=2.605, ppl=6.09, wps=455918, ups=1.05, wpb=432759, bsz=16387.8, num_updates=26900, lr=0.000385615, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=26265 end of epoch 16 (average epoch stats below) epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 epoch 016 | loss 4.207 | nll_loss 2.591 | ppl 6.03 | wps 453311 | ups 1.05 | wpb 433514 | bsz 16511.4 | num_updates 26984 | lr 0.000385014 | gnorm 0.211 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 19.9 | wall 26345 Start iterating over samples epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 epoch 017: 16 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=455696, ups=1.05, wpb=432535, bsz=16583.3, num_updates=27000, lr=0.0003849, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=26360 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.257 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 116 / 1689 loss=4.181, nll_loss=2.561, ppl=5.9, wps=379378, ups=0.87, wpb=436329, bsz=16672.5, num_updates=27100, lr=0.000384189, gnorm=0.205, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=26475 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 216 / 1689 loss=4.179, nll_loss=2.558, ppl=5.89, wps=451207, ups=1.04, wpb=432622, bsz=16370.3, num_updates=27200, lr=0.000383482, gnorm=0.22, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=26571 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 316 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=454190, ups=1.05, wpb=432723, bsz=16593, num_updates=27300, lr=0.00038278, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=26667 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 416 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=460248, ups=1.06, wpb=434883, bsz=16559.2, num_updates=27400, lr=0.00038208, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=26761 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 517 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=451989, ups=1.04, wpb=435007, bsz=16352.2, num_updates=27500, lr=0.000381385, gnorm=0.196, clip=0, loss_scale=2, train_wall=95, gb_free=18.8, wall=26857 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 617 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=454897, ups=1.05, wpb=432503, bsz=16217.4, num_updates=27600, lr=0.000380693, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=26952 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 717 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=458245, ups=1.06, wpb=432621, bsz=16375.8, num_updates=27700, lr=0.000380006, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=27047 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 817 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=455687, ups=1.05, wpb=433439, bsz=17083.8, num_updates=27800, lr=0.000379322, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=27142 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 917 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=457356, ups=1.05, wpb=433861, bsz=16741.4, num_updates=27900, lr=0.000378641, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=27237 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 epoch 017: 1017 / 1689 loss=4.198, nll_loss=2.581, ppl=5.98, wps=455326, ups=1.05, wpb=433419, bsz=16280.1, num_updates=28000, lr=0.000377964, gnorm=0.215, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=27332 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.256 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1117 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=377338, ups=0.87, wpb=435452, bsz=16677.4, num_updates=28100, lr=0.000377291, gnorm=0.212, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=27447 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1217 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=463336, ups=1.07, wpb=433772, bsz=16294.3, num_updates=28200, lr=0.000376622, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=27541 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1317 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=462598, ups=1.07, wpb=432144, bsz=16305.1, num_updates=28300, lr=0.000375956, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=27634 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1418 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=452588, ups=1.05, wpb=432775, bsz=16502.1, num_updates=28400, lr=0.000375293, gnorm=0.2, clip=0, loss_scale=2, train_wall=95, gb_free=20.3, wall=27730 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1518 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=457970, ups=1.06, wpb=432057, bsz=16726.1, num_updates=28500, lr=0.000374634, gnorm=0.214, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=27824 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 epoch 017: 1618 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=465389, ups=1.07, wpb=435543, bsz=16712.3, num_updates=28600, lr=0.000373979, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27918 end of epoch 17 (average epoch stats below) epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 epoch 017 | loss 4.197 | nll_loss 2.58 | ppl 5.98 | wps 446150 | ups 1.03 | wpb 433538 | bsz 16507.1 | num_updates 28671 | lr 0.000373515 | gnorm 0.21 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 27984 Start iterating over samples epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 29 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=455301, ups=1.06, wpb=431046, bsz=16063, num_updates=28700, lr=0.000373327, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28013 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 129 / 1689 loss=4.174, nll_loss=2.553, ppl=5.87, wps=459632, ups=1.06, wpb=434472, bsz=16444.7, num_updates=28800, lr=0.000372678, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28107 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 229 / 1689 loss=4.186, nll_loss=2.567, ppl=5.92, wps=461214, ups=1.06, wpb=437083, bsz=16431.3, num_updates=28900, lr=0.000372033, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=28202 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 epoch 018: 331 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=451106, ups=1.04, wpb=433680, bsz=16546.6, num_updates=29000, lr=0.000371391, gnorm=0.212, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=28298 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018 | valid on 'valid' subset | loss 4.255 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.255 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 431 / 1689 loss=4.184, nll_loss=2.565, ppl=5.92, wps=381060, ups=0.88, wpb=433128, bsz=16709.8, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=28412 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 531 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=457374, ups=1.06, wpb=431879, bsz=16439.9, num_updates=29200, lr=0.000370117, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=28506 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 631 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=456847, ups=1.05, wpb=433575, bsz=16305.6, num_updates=29300, lr=0.000369484, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=28601 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 731 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=458065, ups=1.05, wpb=434216, bsz=16557.8, num_updates=29400, lr=0.000368856, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=28696 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 831 / 1689 loss=4.196, nll_loss=2.579, ppl=5.98, wps=456820, ups=1.05, wpb=434812, bsz=17007.8, num_updates=29500, lr=0.00036823, gnorm=0.216, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=28791 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 931 / 1689 loss=4.193, nll_loss=2.575, ppl=5.96, wps=459073, ups=1.06, wpb=433612, bsz=16207.5, num_updates=29600, lr=0.000367607, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=28886 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1031 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=455926, ups=1.05, wpb=434266, bsz=16319, num_updates=29700, lr=0.000366988, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=28981 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1131 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=458654, ups=1.06, wpb=434658, bsz=16459.4, num_updates=29800, lr=0.000366372, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=29076 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1231 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=453008, ups=1.05, wpb=432759, bsz=16652.5, num_updates=29900, lr=0.000365758, gnorm=0.211, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=29171 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 epoch 018: 1332 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=453024, ups=1.04, wpb=435967, bsz=16292.7, num_updates=30000, lr=0.000365148, gnorm=0.202, clip=0, loss_scale=2, train_wall=95, gb_free=20, wall=29267 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018 | valid on 'valid' subset | loss 4.247 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.247 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1432 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=104560, ups=0.24, wpb=434271, bsz=16409.9, num_updates=30100, lr=0.000364541, gnorm=0.223, clip=0, loss_scale=2, train_wall=352, gb_free=19.3, wall=29683 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1532 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=465465, ups=1.08, wpb=431206, bsz=16384.1, num_updates=30200, lr=0.000363937, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=29775 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 epoch 018: 1632 / 1689 loss=4.185, nll_loss=2.568, ppl=5.93, wps=461026, ups=1.07, wpb=430364, bsz=16867.8, num_updates=30300, lr=0.000363336, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29869 end of epoch 18 (average epoch stats below) epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 epoch 018 | loss 4.188 | nll_loss 2.57 | ppl 5.94 | wps 377238 | ups 0.87 | wpb 433540 | bsz 16507 | num_updates 30357 | lr 0.000362995 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1830 | gb_free 21.1 | wall 29922 Start iterating over samples epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 43 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=458677, ups=1.06, wpb=430928, bsz=16599.5, num_updates=30400, lr=0.000362738, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=29963 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 143 / 1689 loss=4.15, nll_loss=2.527, ppl=5.76, wps=459231, ups=1.06, wpb=432596, bsz=16986.9, num_updates=30500, lr=0.000362143, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=30057 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 243 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=459693, ups=1.06, wpb=431855, bsz=16418.6, num_updates=30600, lr=0.000361551, gnorm=0.198, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=30151 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 343 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459068, ups=1.06, wpb=433238, bsz=16726.2, num_updates=30700, lr=0.000360961, gnorm=0.221, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=30245 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 443 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=460108, ups=1.06, wpb=434409, bsz=16229.5, num_updates=30800, lr=0.000360375, gnorm=0.2, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=30340 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 543 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=459410, ups=1.06, wpb=434534, bsz=16489, num_updates=30900, lr=0.000359791, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=30434 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 epoch 019: 645 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=447682, ups=1.03, wpb=433206, bsz=16884.6, num_updates=31000, lr=0.000359211, gnorm=0.204, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=30531 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.261 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.247 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 745 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=406912, ups=0.94, wpb=432391, bsz=16777.7, num_updates=31100, lr=0.000358633, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30637 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 845 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=466898, ups=1.07, wpb=435752, bsz=16395.4, num_updates=31200, lr=0.000358057, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30730 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 945 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=458052, ups=1.06, wpb=433696, bsz=16303.5, num_updates=31300, lr=0.000357485, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30825 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1045 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=456479, ups=1.05, wpb=432928, bsz=16353.5, num_updates=31400, lr=0.000356915, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=30920 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1145 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=459587, ups=1.06, wpb=435258, bsz=16303.9, num_updates=31500, lr=0.000356348, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=31015 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1245 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=457114, ups=1.05, wpb=434246, bsz=16727.6, num_updates=31600, lr=0.000355784, gnorm=0.215, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=31110 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1345 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=454833, ups=1.04, wpb=435376, bsz=16706.3, num_updates=31700, lr=0.000355222, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=31205 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1445 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=455753, ups=1.05, wpb=432190, bsz=16238.4, num_updates=31800, lr=0.000354663, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=31300 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1545 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=456760, ups=1.05, wpb=434209, bsz=16200.5, num_updates=31900, lr=0.000354107, gnorm=0.198, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=31395 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 epoch 019: 1645 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=458767, ups=1.06, wpb=433356, bsz=16634.7, num_updates=32000, lr=0.000353553, gnorm=0.19, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=31490 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 epoch 019 | valid on 'valid' subset | loss 4.248 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.247 end of epoch 19 (average epoch stats below) epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 epoch 019 | loss 4.179 | nll_loss 2.561 | ppl 5.9 | wps 451324 | ups 1.04 | wpb 433535 | bsz 16504.8 | num_updates 32044 | lr 0.000353311 | gnorm 0.203 | clip 0 | loss_scale 4 | train_wall 1574 | gb_free 20.3 | wall 31542 Start iterating over samples epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 56 / 1689 loss=4.165, nll_loss=2.545, ppl=5.83, wps=405512, ups=0.94, wpb=429434, bsz=16220.7, num_updates=32100, lr=0.000353002, gnorm=0.208, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=31596 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 157 / 1689 loss=4.161, nll_loss=2.539, ppl=5.81, wps=457520, ups=1.06, wpb=433278, bsz=16227.6, num_updates=32200, lr=0.000352454, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=31690 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 257 / 1689 loss=4.162, nll_loss=2.54, ppl=5.82, wps=461833, ups=1.07, wpb=433417, bsz=16415.3, num_updates=32300, lr=0.000351908, gnorm=0.202, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31784 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 357 / 1689 loss=4.154, nll_loss=2.532, ppl=5.78, wps=455970, ups=1.05, wpb=433653, bsz=16595, num_updates=32400, lr=0.000351364, gnorm=0.209, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=31879 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 457 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457758, ups=1.06, wpb=433536, bsz=16878.7, num_updates=32500, lr=0.000350823, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=31974 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 557 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458071, ups=1.05, wpb=435577, bsz=16743.7, num_updates=32600, lr=0.000350285, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32069 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 657 / 1689 loss=4.173, nll_loss=2.553, ppl=5.87, wps=454507, ups=1.05, wpb=434050, bsz=16379.8, num_updates=32700, lr=0.000349749, gnorm=0.216, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=32165 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 758 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=449998, ups=1.04, wpb=433872, bsz=16974.2, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32261 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 858 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=453807, ups=1.05, wpb=432903, bsz=16470.4, num_updates=32900, lr=0.000348684, gnorm=0.203, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=32356 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 epoch 020: 958 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=455033, ups=1.05, wpb=431831, bsz=16470.8, num_updates=33000, lr=0.000348155, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=32451 begin validation on "valid" subset epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020 | valid on 'valid' subset | loss 4.245 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.245 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1058 / 1689 loss=4.178, nll_loss=2.559, ppl=5.89, wps=311711, ups=0.72, wpb=433510, bsz=16163, num_updates=33100, lr=0.000347629, gnorm=0.199, clip=0, loss_scale=2, train_wall=113, gb_free=18.8, wall=32590 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1158 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=460464, ups=1.06, wpb=434921, bsz=16558.2, num_updates=33200, lr=0.000347105, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=32685 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1259 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453493, ups=1.04, wpb=435770, bsz=16600.3, num_updates=33300, lr=0.000346583, gnorm=0.202, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=32781 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1359 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=459358, ups=1.06, wpb=432659, bsz=16348.6, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=32875 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1459 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=455223, ups=1.05, wpb=432843, bsz=16489.7, num_updates=33500, lr=0.000345547, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32970 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1559 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=454537, ups=1.05, wpb=431911, bsz=16315.8, num_updates=33600, lr=0.000345033, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=33065 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 epoch 020: 1659 / 1689 loss=4.184, nll_loss=2.567, ppl=5.93, wps=460649, ups=1.06, wpb=435918, bsz=16596.6, num_updates=33700, lr=0.00034452, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=33160 end of epoch 20 (average epoch stats below) epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 epoch 020 | loss 4.171 | nll_loss 2.552 | ppl 5.86 | wps 444159 | ups 1.02 | wpb 433515 | bsz 16502.6 | num_updates 33730 | lr 0.000344367 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 19.6 | wall 33188 Start iterating over samples epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 70 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=452387, ups=1.05, wpb=429484, bsz=16260.6, num_updates=33800, lr=0.00034401, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=18.2, wall=33255 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 171 / 1689 loss=4.148, nll_loss=2.525, ppl=5.76, wps=454313, ups=1.05, wpb=432578, bsz=16515.4, num_updates=33900, lr=0.000343503, gnorm=0.214, clip=0, loss_scale=2, train_wall=95, gb_free=19.3, wall=33350 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 epoch 021: 271 / 1689 loss=4.153, nll_loss=2.53, ppl=5.78, wps=457558, ups=1.06, wpb=432532, bsz=16554.3, num_updates=34000, lr=0.000342997, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=33445 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.604 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 371 / 1689 loss=4.157, nll_loss=2.535, ppl=5.8, wps=381810, ups=0.88, wpb=433022, bsz=16786.1, num_updates=34100, lr=0.000342494, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33558 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 471 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=463964, ups=1.07, wpb=432162, bsz=16738.3, num_updates=34200, lr=0.000341993, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=33651 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 571 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=466597, ups=1.07, wpb=437285, bsz=16465.4, num_updates=34300, lr=0.000341494, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=33745 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 672 / 1689 loss=4.164, nll_loss=2.543, ppl=5.83, wps=454492, ups=1.05, wpb=432392, bsz=16366.2, num_updates=34400, lr=0.000340997, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=33840 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 772 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=455107, ups=1.05, wpb=433387, bsz=16617.7, num_updates=34500, lr=0.000340503, gnorm=0.21, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=33935 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 872 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=459321, ups=1.06, wpb=433284, bsz=16319.7, num_updates=34600, lr=0.00034001, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=34030 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 972 / 1689 loss=4.167, nll_loss=2.547, ppl=5.84, wps=461631, ups=1.07, wpb=433098, bsz=16375.6, num_updates=34700, lr=0.00033952, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.6, wall=34123 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1072 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=463038, ups=1.06, wpb=436432, bsz=16383.7, num_updates=34800, lr=0.000339032, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=34218 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1172 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=454828, ups=1.05, wpb=433084, bsz=16847.8, num_updates=34900, lr=0.000338546, gnorm=0.206, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=34313 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 epoch 021: 1274 / 1689 loss=4.175, nll_loss=2.557, ppl=5.88, wps=452434, ups=1.04, wpb=436159, bsz=16506.6, num_updates=35000, lr=0.000338062, gnorm=0.186, clip=0, loss_scale=1, train_wall=95, gb_free=19.3, wall=34409 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021 | valid on 'valid' subset | loss 4.236 | nll_loss 2.598 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.236 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1374 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=375951, ups=0.87, wpb=433531, bsz=16156.5, num_updates=35100, lr=0.00033758, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=34525 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1474 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=459433, ups=1.06, wpb=433081, bsz=16363.8, num_updates=35200, lr=0.0003371, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34619 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1574 / 1689 loss=4.166, nll_loss=2.547, ppl=5.84, wps=460267, ups=1.06, wpb=433901, bsz=16503.8, num_updates=35300, lr=0.000336622, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=34713 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 epoch 021: 1674 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=460544, ups=1.06, wpb=434130, bsz=16559, num_updates=35400, lr=0.000336146, gnorm=0.215, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=34807 end of epoch 21 (average epoch stats below) epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 epoch 021 | loss 4.164 | nll_loss 2.544 | ppl 5.83 | wps 447222 | ups 1.03 | wpb 433532 | bsz 16501.5 | num_updates 35415 | lr 0.000336075 | gnorm 0.202 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 20 | wall 34821 Start iterating over samples epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 85 / 1689 loss=4.138, nll_loss=2.513, ppl=5.71, wps=454431, ups=1.06, wpb=430581, bsz=16334.7, num_updates=35500, lr=0.000335673, gnorm=0.203, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=34902 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 185 / 1689 loss=4.139, nll_loss=2.515, ppl=5.72, wps=458285, ups=1.06, wpb=431346, bsz=16346.7, num_updates=35600, lr=0.000335201, gnorm=0.209, clip=0, loss_scale=2, train_wall=92, gb_free=16.9, wall=34996 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 285 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=458489, ups=1.06, wpb=433350, bsz=16607.8, num_updates=35700, lr=0.000334731, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35091 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 385 / 1689 loss=4.155, nll_loss=2.533, ppl=5.79, wps=459454, ups=1.06, wpb=434187, bsz=16586.4, num_updates=35800, lr=0.000334263, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35185 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 485 / 1689 loss=4.154, nll_loss=2.532, ppl=5.79, wps=457234, ups=1.05, wpb=433987, bsz=16532.9, num_updates=35900, lr=0.000333797, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=35280 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 epoch 022: 585 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=462561, ups=1.07, wpb=434319, bsz=16739.4, num_updates=36000, lr=0.000333333, gnorm=0.205, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=35374 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.238 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.236 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 685 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=410120, ups=0.94, wpb=435024, bsz=16501, num_updates=36100, lr=0.000332871, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=35480 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 786 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=455594, ups=1.05, wpb=433654, bsz=16167, num_updates=36200, lr=0.000332411, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=35575 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 886 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=459987, ups=1.06, wpb=434698, bsz=16747, num_updates=36300, lr=0.000331953, gnorm=0.204, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35670 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 987 / 1689 loss=4.164, nll_loss=2.544, ppl=5.83, wps=457555, ups=1.05, wpb=435151, bsz=16637.2, num_updates=36400, lr=0.000331497, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=35765 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1087 / 1689 loss=4.157, nll_loss=2.536, ppl=5.8, wps=458386, ups=1.05, wpb=435640, bsz=16548.4, num_updates=36500, lr=0.000331042, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35860 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1187 / 1689 loss=4.169, nll_loss=2.55, ppl=5.86, wps=457240, ups=1.06, wpb=433340, bsz=16355.9, num_updates=36600, lr=0.00033059, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=35955 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1287 / 1689 loss=4.16, nll_loss=2.539, ppl=5.81, wps=454194, ups=1.05, wpb=432256, bsz=16581.4, num_updates=36700, lr=0.000330139, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=36050 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1387 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=457036, ups=1.05, wpb=434487, bsz=16747.7, num_updates=36800, lr=0.00032969, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=36145 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1487 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=457679, ups=1.06, wpb=433628, bsz=16277.8, num_updates=36900, lr=0.000329243, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=36240 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 epoch 022: 1587 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=456771, ups=1.05, wpb=433069, bsz=16595.8, num_updates=37000, lr=0.000328798, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=36335 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022 | valid on 'valid' subset | loss 4.241 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.236 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 epoch 022: 1687 / 1689 loss=4.16, nll_loss=2.54, ppl=5.81, wps=400478, ups=0.93, wpb=432138, bsz=16494, num_updates=37100, lr=0.000328355, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=36443 end of epoch 22 (average epoch stats below) epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 epoch 022 | loss 4.157 | nll_loss 2.536 | ppl 5.8 | wps 450710 | ups 1.04 | wpb 433535 | bsz 16508.1 | num_updates 37102 | lr 0.000328346 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.8 | wall 36444 Start iterating over samples epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 99 / 1689 loss=4.14, nll_loss=2.516, ppl=5.72, wps=446577, ups=1.04, wpb=430754, bsz=16394.7, num_updates=37200, lr=0.000327913, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36539 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 199 / 1689 loss=4.145, nll_loss=2.522, ppl=5.74, wps=462002, ups=1.06, wpb=434008, bsz=17069.5, num_updates=37300, lr=0.000327473, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=36633 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 299 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=457532, ups=1.05, wpb=435224, bsz=16850.2, num_updates=37400, lr=0.000327035, gnorm=0.199, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=36728 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 399 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=461326, ups=1.06, wpb=433639, bsz=16597, num_updates=37500, lr=0.000326599, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=36822 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 499 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=453213, ups=1.04, wpb=433888, bsz=16861.4, num_updates=37600, lr=0.000326164, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=36918 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 599 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=455931, ups=1.05, wpb=433781, bsz=16156.8, num_updates=37700, lr=0.000325731, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=37013 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 699 / 1689 loss=4.147, nll_loss=2.525, ppl=5.76, wps=455734, ups=1.05, wpb=432586, bsz=16534.2, num_updates=37800, lr=0.0003253, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=37108 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 799 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454637, ups=1.05, wpb=432105, bsz=16513.8, num_updates=37900, lr=0.000324871, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37203 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 epoch 023: 899 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=459368, ups=1.06, wpb=435007, bsz=16790.2, num_updates=38000, lr=0.000324443, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=37298 begin validation on "valid" subset epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023 | valid on 'valid' subset | loss 4.229 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.229 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 999 / 1689 loss=4.151, nll_loss=2.53, ppl=5.77, wps=302764, ups=0.7, wpb=430958, bsz=16193.5, num_updates=38100, lr=0.000324017, gnorm=0.207, clip=0, loss_scale=2, train_wall=117, gb_free=20.4, wall=37440 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1100 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=456568, ups=1.05, wpb=435523, bsz=16286.3, num_updates=38200, lr=0.000323592, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=37535 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1200 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=460389, ups=1.06, wpb=433934, bsz=16217.4, num_updates=38300, lr=0.00032317, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=37630 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1300 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=458368, ups=1.05, wpb=434916, bsz=16432.3, num_updates=38400, lr=0.000322749, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=37725 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1400 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=453981, ups=1.05, wpb=433616, bsz=16567.7, num_updates=38500, lr=0.000322329, gnorm=0.189, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=37820 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1500 / 1689 loss=4.158, nll_loss=2.537, ppl=5.81, wps=455762, ups=1.05, wpb=433594, bsz=16488.2, num_updates=38600, lr=0.000321911, gnorm=0.194, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=37915 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 epoch 023: 1600 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=457359, ups=1.06, wpb=432618, bsz=16390.6, num_updates=38700, lr=0.000321495, gnorm=0.21, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=38010 end of epoch 23 (average epoch stats below) epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 epoch 023 | loss 4.151 | nll_loss 2.529 | ppl 5.77 | wps 443462 | ups 1.02 | wpb 433528 | bsz 16505.9 | num_updates 38789 | lr 0.000321126 | gnorm 0.2 | clip 0 | loss_scale 4 | train_wall 1598 | gb_free 22 | wall 38093 Start iterating over samples epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 12 / 1689 loss=4.159, nll_loss=2.538, ppl=5.81, wps=452549, ups=1.05, wpb=430208, bsz=16105.4, num_updates=38800, lr=0.000321081, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38105 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 112 / 1689 loss=4.136, nll_loss=2.512, ppl=5.7, wps=457588, ups=1.06, wpb=433500, bsz=16779.9, num_updates=38900, lr=0.000320668, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=38200 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 epoch 024: 212 / 1689 loss=4.131, nll_loss=2.506, ppl=5.68, wps=456239, ups=1.06, wpb=432272, bsz=16718.6, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38294 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.229 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 312 / 1689 loss=4.138, nll_loss=2.515, ppl=5.71, wps=404420, ups=0.93, wpb=433606, bsz=16245.4, num_updates=39100, lr=0.000319847, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38402 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 412 / 1689 loss=4.141, nll_loss=2.518, ppl=5.73, wps=461595, ups=1.06, wpb=435741, bsz=16420.9, num_updates=39200, lr=0.000319438, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38496 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 512 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=458888, ups=1.06, wpb=433060, bsz=16709, num_updates=39300, lr=0.000319032, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=38590 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 613 / 1689 loss=4.151, nll_loss=2.529, ppl=5.77, wps=454645, ups=1.04, wpb=435622, bsz=16297.4, num_updates=39400, lr=0.000318626, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=38686 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 713 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=459709, ups=1.06, wpb=434240, bsz=16266.2, num_updates=39500, lr=0.000318223, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=38781 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 813 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=464296, ups=1.07, wpb=433008, bsz=16326.2, num_updates=39600, lr=0.000317821, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=38874 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 913 / 1689 loss=4.155, nll_loss=2.534, ppl=5.79, wps=463395, ups=1.07, wpb=434263, bsz=16549.4, num_updates=39700, lr=0.00031742, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=38968 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1013 / 1689 loss=4.149, nll_loss=2.527, ppl=5.76, wps=465672, ups=1.07, wpb=433277, bsz=16502.4, num_updates=39800, lr=0.000317021, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=39061 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1113 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=459358, ups=1.06, wpb=433888, bsz=17074.2, num_updates=39900, lr=0.000316624, gnorm=0.21, clip=0, loss_scale=4, train_wall=94, gb_free=19.1, wall=39155 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 epoch 024: 1215 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=450458, ups=1.04, wpb=434839, bsz=16633.3, num_updates=40000, lr=0.000316228, gnorm=0.194, clip=0, loss_scale=1, train_wall=96, gb_free=16.9, wall=39252 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024 | valid on 'valid' subset | loss 4.228 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.228 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1315 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=114104, ups=0.26, wpb=431984, bsz=16493.4, num_updates=40100, lr=0.000315833, gnorm=0.194, clip=0, loss_scale=1, train_wall=139, gb_free=18.8, wall=39630 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1415 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464113, ups=1.07, wpb=433214, bsz=16179.9, num_updates=40200, lr=0.00031544, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=39724 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1515 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=464816, ups=1.07, wpb=433972, bsz=16471.4, num_updates=40300, lr=0.000315049, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39817 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 epoch 024: 1615 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462890, ups=1.07, wpb=433288, bsz=16574, num_updates=40400, lr=0.000314658, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=16.3, wall=39911 end of epoch 24 (average epoch stats below) epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 epoch 024 | loss 4.145 | nll_loss 2.523 | ppl 5.75 | wps 387210 | ups 0.89 | wpb 433540 | bsz 16499.5 | num_updates 40474 | lr 0.000314371 | gnorm 0.2 | clip 0 | loss_scale 2 | train_wall 1618 | gb_free 19.6 | wall 39980 Start iterating over samples epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 26 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=452338, ups=1.05, wpb=431704, bsz=16204.9, num_updates=40500, lr=0.00031427, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=40006 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 126 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=458056, ups=1.06, wpb=431900, bsz=16456.4, num_updates=40600, lr=0.000313882, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=40100 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 226 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=459076, ups=1.06, wpb=434721, bsz=16546.4, num_updates=40700, lr=0.000313497, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=40195 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 326 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=460646, ups=1.06, wpb=435523, bsz=16643.1, num_updates=40800, lr=0.000313112, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=40290 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 426 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=458966, ups=1.06, wpb=431541, bsz=16626.8, num_updates=40900, lr=0.000312729, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18, wall=40384 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 epoch 025: 526 / 1689 loss=4.132, nll_loss=2.508, ppl=5.69, wps=461632, ups=1.07, wpb=432016, bsz=16649.9, num_updates=41000, lr=0.000312348, gnorm=0.189, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=40477 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025 | valid on 'valid' subset | loss 4.231 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.228 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 627 / 1689 loss=4.144, nll_loss=2.521, ppl=5.74, wps=407560, ups=0.94, wpb=434056, bsz=16213.3, num_updates=41100, lr=0.000311967, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40584 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 727 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=454229, ups=1.05, wpb=434038, bsz=16858.3, num_updates=41200, lr=0.000311588, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=40679 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 827 / 1689 loss=4.14, nll_loss=2.517, ppl=5.72, wps=457976, ups=1.06, wpb=433830, bsz=16363.4, num_updates=41300, lr=0.000311211, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=40774 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 927 / 1689 loss=4.132, nll_loss=2.509, ppl=5.69, wps=455145, ups=1.05, wpb=432684, bsz=16772.3, num_updates=41400, lr=0.000310835, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=40869 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1027 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=458214, ups=1.05, wpb=434759, bsz=16368.8, num_updates=41500, lr=0.00031046, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=40964 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1127 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=458611, ups=1.06, wpb=432624, bsz=16232, num_updates=41600, lr=0.000310087, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=41058 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1228 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=454398, ups=1.04, wpb=434831, bsz=16350.3, num_updates=41700, lr=0.000309715, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=41154 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1328 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=457977, ups=1.05, wpb=435681, bsz=16699.1, num_updates=41800, lr=0.000309344, gnorm=0.191, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=41249 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1428 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=459329, ups=1.06, wpb=434410, bsz=16341.2, num_updates=41900, lr=0.000308975, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=41344 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 epoch 025: 1528 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=458385, ups=1.06, wpb=433830, bsz=16720.8, num_updates=42000, lr=0.000308607, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=41438 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025 | valid on 'valid' subset | loss 4.221 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.221 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 epoch 025: 1628 / 1689 loss=4.142, nll_loss=2.52, ppl=5.73, wps=378324, ups=0.88, wpb=432275, bsz=16311, num_updates=42100, lr=0.00030824, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=41553 end of epoch 25 (average epoch stats below) epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 epoch 025 | loss 4.139 | nll_loss 2.516 | ppl 5.72 | wps 448863 | ups 1.04 | wpb 433513 | bsz 16504.9 | num_updates 42161 | lr 0.000308017 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.1 | wall 41609 Start iterating over samples epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 39 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460045, ups=1.07, wpb=431143, bsz=16556.1, num_updates=42200, lr=0.000307875, gnorm=0.195, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=41646 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 139 / 1689 loss=4.121, nll_loss=2.495, ppl=5.64, wps=463588, ups=1.07, wpb=435010, bsz=16474.1, num_updates=42300, lr=0.00030751, gnorm=0.194, clip=0, loss_scale=4, train_wall=93, gb_free=20, wall=41740 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 240 / 1689 loss=4.129, nll_loss=2.504, ppl=5.67, wps=455105, ups=1.05, wpb=434296, bsz=16713.6, num_updates=42400, lr=0.000307148, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=41836 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 340 / 1689 loss=4.132, nll_loss=2.507, ppl=5.69, wps=456773, ups=1.05, wpb=435325, bsz=16605.9, num_updates=42500, lr=0.000306786, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=41931 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 440 / 1689 loss=4.135, nll_loss=2.511, ppl=5.7, wps=456360, ups=1.05, wpb=434092, bsz=16283.8, num_updates=42600, lr=0.000306426, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=42026 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 540 / 1689 loss=4.128, nll_loss=2.503, ppl=5.67, wps=456802, ups=1.06, wpb=431496, bsz=16663.7, num_updates=42700, lr=0.000306067, gnorm=0.202, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=42120 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 640 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458550, ups=1.06, wpb=433468, bsz=16352.1, num_updates=42800, lr=0.000305709, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=42215 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 740 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461756, ups=1.06, wpb=435117, bsz=16572.3, num_updates=42900, lr=0.000305352, gnorm=0.187, clip=0, loss_scale=4, train_wall=93, gb_free=17.7, wall=42309 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 epoch 026: 840 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=456344, ups=1.05, wpb=432781, bsz=16415.3, num_updates=43000, lr=0.000304997, gnorm=0.207, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=42404 begin validation on "valid" subset epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026 | valid on 'valid' subset | loss 4.23 | nll_loss 2.592 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.221 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 941 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=361350, ups=0.83, wpb=435687, bsz=16505.4, num_updates=43100, lr=0.000304643, gnorm=0.201, clip=0, loss_scale=2, train_wall=99, gb_free=18.5, wall=42525 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1041 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=458245, ups=1.06, wpb=431750, bsz=16312.1, num_updates=43200, lr=0.00030429, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=42619 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1142 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=454808, ups=1.05, wpb=432343, bsz=16981.4, num_updates=43300, lr=0.000303939, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=16.6, wall=42714 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1242 / 1689 loss=4.136, nll_loss=2.513, ppl=5.71, wps=462223, ups=1.07, wpb=432521, bsz=16289.7, num_updates=43400, lr=0.000303588, gnorm=0.199, clip=0, loss_scale=1, train_wall=92, gb_free=17.6, wall=42807 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1342 / 1689 loss=4.128, nll_loss=2.504, ppl=5.67, wps=459000, ups=1.06, wpb=433206, bsz=16427.7, num_updates=43500, lr=0.000303239, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=42902 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1442 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=459559, ups=1.06, wpb=435285, bsz=16739.8, num_updates=43600, lr=0.000302891, gnorm=0.193, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=42997 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1542 / 1689 loss=4.139, nll_loss=2.516, ppl=5.72, wps=461708, ups=1.07, wpb=432839, bsz=16364.9, num_updates=43700, lr=0.000302545, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=43090 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 epoch 026: 1642 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459447, ups=1.06, wpb=433432, bsz=16455.3, num_updates=43800, lr=0.000302199, gnorm=0.189, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=43185 end of epoch 26 (average epoch stats below) epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 epoch 026 | loss 4.134 | nll_loss 2.511 | ppl 5.7 | wps 451428 | ups 1.04 | wpb 433530 | bsz 16504.6 | num_updates 43847 | lr 0.000302037 | gnorm 0.197 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 18.9 | wall 43228 Start iterating over samples epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 53 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457258, ups=1.06, wpb=429827, bsz=16044.2, num_updates=43900, lr=0.000301855, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=43279 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 epoch 027: 153 / 1689 loss=4.123, nll_loss=2.497, ppl=5.65, wps=466496, ups=1.08, wpb=433886, bsz=16660.3, num_updates=44000, lr=0.000301511, gnorm=0.188, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=43372 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.223 | nll_loss 2.585 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.221 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 253 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=404546, ups=0.93, wpb=433753, bsz=16126.5, num_updates=44100, lr=0.000301169, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=43479 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 353 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=464244, ups=1.07, wpb=434810, bsz=16415.3, num_updates=44200, lr=0.000300828, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=43573 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 453 / 1689 loss=4.124, nll_loss=2.499, ppl=5.65, wps=462666, ups=1.07, wpb=433007, bsz=16434, num_updates=44300, lr=0.000300489, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=43666 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 553 / 1689 loss=4.119, nll_loss=2.493, ppl=5.63, wps=462320, ups=1.07, wpb=432534, bsz=16443.8, num_updates=44400, lr=0.00030015, gnorm=0.184, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=43760 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 653 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=457854, ups=1.05, wpb=435452, bsz=16382.4, num_updates=44500, lr=0.000299813, gnorm=0.2, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=43855 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 754 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=458636, ups=1.05, wpb=434753, bsz=16590.6, num_updates=44600, lr=0.000299476, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=43950 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 854 / 1689 loss=4.138, nll_loss=2.516, ppl=5.72, wps=459690, ups=1.06, wpb=435024, bsz=16767.3, num_updates=44700, lr=0.000299141, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44044 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 954 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=457082, ups=1.06, wpb=431826, bsz=16253.4, num_updates=44800, lr=0.000298807, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44139 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1054 / 1689 loss=4.131, nll_loss=2.507, ppl=5.69, wps=457014, ups=1.05, wpb=433309, bsz=16851, num_updates=44900, lr=0.000298474, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=44234 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 epoch 027: 1154 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=460206, ups=1.06, wpb=434284, bsz=17034.9, num_updates=45000, lr=0.000298142, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=44328 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027 | valid on 'valid' subset | loss 4.224 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.221 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1254 / 1689 loss=4.126, nll_loss=2.501, ppl=5.66, wps=326793, ups=0.75, wpb=436200, bsz=16387.3, num_updates=45100, lr=0.000297812, gnorm=0.186, clip=0, loss_scale=2, train_wall=116, gb_free=19, wall=44461 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1355 / 1689 loss=4.137, nll_loss=2.514, ppl=5.71, wps=457555, ups=1.05, wpb=434450, bsz=16487.8, num_updates=45200, lr=0.000297482, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44556 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1455 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=459742, ups=1.06, wpb=431725, bsz=16347, num_updates=45300, lr=0.000297154, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44650 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1555 / 1689 loss=4.13, nll_loss=2.507, ppl=5.69, wps=460935, ups=1.07, wpb=431926, bsz=16503.8, num_updates=45400, lr=0.000296826, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44744 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 epoch 027: 1655 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=460235, ups=1.06, wpb=432777, bsz=16632.9, num_updates=45500, lr=0.0002965, gnorm=0.197, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=44838 end of epoch 27 (average epoch stats below) epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 epoch 027 | loss 4.129 | nll_loss 2.505 | ppl 5.68 | wps 445515 | ups 1.03 | wpb 433528 | bsz 16504.8 | num_updates 45534 | lr 0.000296389 | gnorm 0.196 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.3 | wall 44870 Start iterating over samples epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 66 / 1689 loss=4.122, nll_loss=2.497, ppl=5.64, wps=454495, ups=1.05, wpb=430908, bsz=16620.9, num_updates=45600, lr=0.000296174, gnorm=0.198, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=44933 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 167 / 1689 loss=4.106, nll_loss=2.478, ppl=5.57, wps=453148, ups=1.04, wpb=433952, bsz=16466.8, num_updates=45700, lr=0.00029585, gnorm=0.195, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=45029 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 267 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454773, ups=1.05, wpb=433029, bsz=16363.6, num_updates=45800, lr=0.000295527, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=45124 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 367 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=459085, ups=1.06, wpb=434039, bsz=16658.5, num_updates=45900, lr=0.000295205, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=45218 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 epoch 028: 467 / 1689 loss=4.117, nll_loss=2.491, ppl=5.62, wps=456487, ups=1.05, wpb=433660, bsz=16634, num_updates=46000, lr=0.000294884, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45313 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.219 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.219 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 568 / 1689 loss=4.123, nll_loss=2.498, ppl=5.65, wps=372660, ups=0.86, wpb=432575, bsz=16867, num_updates=46100, lr=0.000294564, gnorm=0.191, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=45429 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 668 / 1689 loss=4.121, nll_loss=2.496, ppl=5.64, wps=457477, ups=1.05, wpb=435069, bsz=16617.6, num_updates=46200, lr=0.000294245, gnorm=0.203, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=45525 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 768 / 1689 loss=4.127, nll_loss=2.503, ppl=5.67, wps=459647, ups=1.06, wpb=432716, bsz=16430.2, num_updates=46300, lr=0.000293927, gnorm=0.198, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45619 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 868 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=457713, ups=1.06, wpb=431855, bsz=16452.6, num_updates=46400, lr=0.00029361, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=45713 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 968 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=457355, ups=1.05, wpb=434328, bsz=16371, num_updates=46500, lr=0.000293294, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=45808 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1068 / 1689 loss=4.138, nll_loss=2.515, ppl=5.72, wps=458878, ups=1.06, wpb=432897, bsz=16105.4, num_updates=46600, lr=0.000292979, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45902 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1168 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457022, ups=1.05, wpb=434340, bsz=16379.3, num_updates=46700, lr=0.000292666, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=45997 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1268 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=458021, ups=1.05, wpb=434373, bsz=16303.6, num_updates=46800, lr=0.000292353, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=46092 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1368 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=456542, ups=1.05, wpb=433484, bsz=16614.6, num_updates=46900, lr=0.000292041, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=46187 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 epoch 028: 1468 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457860, ups=1.06, wpb=431948, bsz=16735.3, num_updates=47000, lr=0.00029173, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=46282 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028 | valid on 'valid' subset | loss 4.221 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.219 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1568 / 1689 loss=4.132, nll_loss=2.51, ppl=5.69, wps=407573, ups=0.94, wpb=433734, bsz=16465.8, num_updates=47100, lr=0.00029142, gnorm=0.192, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=46388 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 epoch 028: 1668 / 1689 loss=4.134, nll_loss=2.511, ppl=5.7, wps=460953, ups=1.06, wpb=436173, bsz=16606.6, num_updates=47200, lr=0.000291111, gnorm=0.202, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=46483 end of epoch 28 (average epoch stats below) epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 epoch 028 | loss 4.124 | nll_loss 2.5 | ppl 5.66 | wps 448122 | ups 1.03 | wpb 433519 | bsz 16504 | num_updates 47221 | lr 0.000291047 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1571 | gb_free 20.5 | wall 46502 Start iterating over samples epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 80 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=448899, ups=1.04, wpb=430103, bsz=16374.6, num_updates=47300, lr=0.000290803, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=46578 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 180 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455087, ups=1.05, wpb=434554, bsz=16942.9, num_updates=47400, lr=0.000290496, gnorm=0.2, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=46674 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 280 / 1689 loss=4.108, nll_loss=2.48, ppl=5.58, wps=461057, ups=1.06, wpb=433476, bsz=16349.7, num_updates=47500, lr=0.000290191, gnorm=0.194, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=46768 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 380 / 1689 loss=4.109, nll_loss=2.482, ppl=5.59, wps=454357, ups=1.05, wpb=430883, bsz=16033.3, num_updates=47600, lr=0.000289886, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=46863 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 480 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455240, ups=1.05, wpb=434241, bsz=16794.4, num_updates=47700, lr=0.000289581, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=46958 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 581 / 1689 loss=4.113, nll_loss=2.487, ppl=5.6, wps=451549, ups=1.04, wpb=433075, bsz=16874, num_updates=47800, lr=0.000289278, gnorm=0.196, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=47054 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 681 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=458524, ups=1.06, wpb=432765, bsz=16568.1, num_updates=47900, lr=0.000288976, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=47148 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 epoch 029: 781 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=464774, ups=1.07, wpb=434070, bsz=16409, num_updates=48000, lr=0.000288675, gnorm=0.193, clip=0, loss_scale=2, train_wall=92, gb_free=16.5, wall=47242 begin validation on "valid" subset epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029 | valid on 'valid' subset | loss 4.224 | nll_loss 2.584 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.219 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 881 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=127122, ups=0.29, wpb=433554, bsz=16533.4, num_updates=48100, lr=0.000288375, gnorm=0.188, clip=0, loss_scale=2, train_wall=252, gb_free=18.6, wall=47583 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 981 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459824, ups=1.07, wpb=431079, bsz=16528.1, num_updates=48200, lr=0.000288076, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=47677 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1081 / 1689 loss=4.125, nll_loss=2.5, ppl=5.66, wps=458940, ups=1.06, wpb=434290, bsz=16404.3, num_updates=48300, lr=0.000287777, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=19.8, wall=47771 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1182 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=458391, ups=1.05, wpb=434612, bsz=16743.6, num_updates=48400, lr=0.00028748, gnorm=0.186, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=47866 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1282 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462520, ups=1.06, wpb=435194, bsz=16580.5, num_updates=48500, lr=0.000287183, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=47960 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1382 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462164, ups=1.07, wpb=433219, bsz=16586.4, num_updates=48600, lr=0.000286888, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=48054 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1482 / 1689 loss=4.129, nll_loss=2.505, ppl=5.68, wps=463547, ups=1.06, wpb=436008, bsz=16331, num_updates=48700, lr=0.000286593, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=48148 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1582 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=459755, ups=1.06, wpb=433192, bsz=16258.6, num_updates=48800, lr=0.000286299, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48242 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 epoch 029: 1683 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=454868, ups=1.04, wpb=436931, bsz=16283.3, num_updates=48900, lr=0.000286006, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=48338 end of epoch 29 (average epoch stats below) epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 epoch 029 | loss 4.12 | nll_loss 2.495 | ppl 5.64 | wps 396714 | ups 0.92 | wpb 433526 | bsz 16504.8 | num_updates 48906 | lr 0.000285989 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1730 | gb_free 22.6 | wall 48343 Start iterating over samples epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 epoch 030: 94 / 1689 loss=4.11, nll_loss=2.483, ppl=5.59, wps=454362, ups=1.06, wpb=430294, bsz=16611.8, num_updates=49000, lr=0.000285714, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=48433 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.217 | nll_loss 2.581 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.217 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 194 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=373215, ups=0.86, wpb=433408, bsz=16161.5, num_updates=49100, lr=0.000285423, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=17.5, wall=48549 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 294 / 1689 loss=4.115, nll_loss=2.489, ppl=5.61, wps=461516, ups=1.06, wpb=434119, bsz=16314.8, num_updates=49200, lr=0.000285133, gnorm=0.185, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=48643 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 394 / 1689 loss=4.11, nll_loss=2.484, ppl=5.59, wps=460726, ups=1.06, wpb=433711, bsz=16346.9, num_updates=49300, lr=0.000284844, gnorm=0.191, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=48737 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 495 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456455, ups=1.06, wpb=432546, bsz=16851.5, num_updates=49400, lr=0.000284555, gnorm=0.194, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48832 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 595 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459078, ups=1.06, wpb=432635, bsz=16535.8, num_updates=49500, lr=0.000284268, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=48926 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 695 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=460542, ups=1.06, wpb=433164, bsz=16326.6, num_updates=49600, lr=0.000283981, gnorm=0.187, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=49020 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 795 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=459790, ups=1.06, wpb=434962, bsz=16281.7, num_updates=49700, lr=0.000283695, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49115 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 895 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=457848, ups=1.05, wpb=435191, bsz=16462.1, num_updates=49800, lr=0.00028341, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49210 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 995 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=458764, ups=1.06, wpb=433172, bsz=16575.1, num_updates=49900, lr=0.000283126, gnorm=0.188, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=49304 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 epoch 030: 1095 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=456944, ups=1.06, wpb=431428, bsz=16660.5, num_updates=50000, lr=0.000282843, gnorm=0.184, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=49399 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030 | valid on 'valid' subset | loss 4.227 | nll_loss 2.593 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.217 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1195 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=322135, ups=0.74, wpb=432565, bsz=16911.4, num_updates=50100, lr=0.00028256, gnorm=0.202, clip=0, loss_scale=2, train_wall=116, gb_free=17.3, wall=49533 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1295 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=461827, ups=1.06, wpb=435382, bsz=16281.9, num_updates=50200, lr=0.000282279, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=49627 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1395 / 1689 loss=4.113, nll_loss=2.488, ppl=5.61, wps=455226, ups=1.05, wpb=432769, bsz=16651.8, num_updates=50300, lr=0.000281998, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=49723 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1496 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=450972, ups=1.04, wpb=433452, bsz=16686.5, num_updates=50400, lr=0.000281718, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=49819 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 epoch 030: 1596 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=462636, ups=1.06, wpb=437620, bsz=16530.3, num_updates=50500, lr=0.000281439, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=49913 end of epoch 30 (average epoch stats below) epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 epoch 030 | loss 4.115 | nll_loss 2.49 | ppl 5.62 | wps 441237 | ups 1.02 | wpb 433548 | bsz 16505 | num_updates 50593 | lr 0.00028118 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1591 | gb_free 20.6 | wall 50001 Start iterating over samples epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 7 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=454567, ups=1.06, wpb=430410, bsz=16453.9, num_updates=50600, lr=0.000281161, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=50008 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 107 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=460836, ups=1.07, wpb=431524, bsz=16414.9, num_updates=50700, lr=0.000280883, gnorm=0.199, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=50102 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 207 / 1689 loss=4.094, nll_loss=2.465, ppl=5.52, wps=455591, ups=1.05, wpb=434086, bsz=16644.7, num_updates=50800, lr=0.000280607, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=50197 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 307 / 1689 loss=4.106, nll_loss=2.479, ppl=5.57, wps=460640, ups=1.06, wpb=434616, bsz=16344.1, num_updates=50900, lr=0.000280331, gnorm=0.199, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=50291 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 epoch 031: 408 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=453496, ups=1.05, wpb=432701, bsz=16377.9, num_updates=51000, lr=0.000280056, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=50387 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031 | valid on 'valid' subset | loss 4.234 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.217 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 508 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=405032, ups=0.93, wpb=435001, bsz=16788.8, num_updates=51100, lr=0.000279782, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=50494 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 608 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=459155, ups=1.06, wpb=433910, bsz=16527.1, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=50589 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 708 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=458856, ups=1.05, wpb=435202, bsz=16577.9, num_updates=51300, lr=0.000279236, gnorm=0.195, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=50683 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 809 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=456059, ups=1.05, wpb=435830, bsz=16220, num_updates=51400, lr=0.000278964, gnorm=0.19, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=50779 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 909 / 1689 loss=4.119, nll_loss=2.494, ppl=5.63, wps=458283, ups=1.05, wpb=435356, bsz=16359.5, num_updates=51500, lr=0.000278693, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=50874 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1009 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=453194, ups=1.05, wpb=431060, bsz=16825.2, num_updates=51600, lr=0.000278423, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=50969 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1109 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=455747, ups=1.05, wpb=432398, bsz=15988.1, num_updates=51700, lr=0.000278154, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=51064 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1209 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=457707, ups=1.06, wpb=433520, bsz=16700.9, num_updates=51800, lr=0.000277885, gnorm=0.186, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=51159 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1309 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459081, ups=1.06, wpb=433012, bsz=16257.8, num_updates=51900, lr=0.000277617, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=51253 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 epoch 031: 1410 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=455115, ups=1.05, wpb=434354, bsz=16865.7, num_updates=52000, lr=0.00027735, gnorm=0.189, clip=0, loss_scale=1, train_wall=94, gb_free=20.1, wall=51348 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031 | valid on 'valid' subset | loss 4.216 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.216 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1510 / 1689 loss=4.117, nll_loss=2.493, ppl=5.63, wps=384278, ups=0.89, wpb=433889, bsz=16920, num_updates=52100, lr=0.000277084, gnorm=0.188, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=51461 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 epoch 031: 1610 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=462567, ups=1.07, wpb=434258, bsz=16317.4, num_updates=52200, lr=0.000276818, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=51555 end of epoch 31 (average epoch stats below) epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 epoch 031 | loss 4.111 | nll_loss 2.486 | ppl 5.6 | wps 448941 | ups 1.04 | wpb 433528 | bsz 16506.8 | num_updates 52279 | lr 0.000276609 | gnorm 0.195 | clip 0 | loss_scale 1 | train_wall 1574 | gb_free 20.3 | wall 51629 Start iterating over samples epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 21 / 1689 loss=4.105, nll_loss=2.479, ppl=5.58, wps=454674, ups=1.06, wpb=428884, bsz=16530.5, num_updates=52300, lr=0.000276553, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=51650 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 121 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=455823, ups=1.05, wpb=433056, bsz=16790.9, num_updates=52400, lr=0.000276289, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=51745 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 221 / 1689 loss=4.091, nll_loss=2.462, ppl=5.51, wps=454989, ups=1.05, wpb=433147, bsz=16419.7, num_updates=52500, lr=0.000276026, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=51840 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 321 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=458633, ups=1.06, wpb=432538, bsz=16593.8, num_updates=52600, lr=0.000275764, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=51934 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 421 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=454984, ups=1.05, wpb=435136, bsz=16619.6, num_updates=52700, lr=0.000275502, gnorm=0.197, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=52030 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 521 / 1689 loss=4.102, nll_loss=2.475, ppl=5.56, wps=455009, ups=1.05, wpb=433880, bsz=16742.1, num_updates=52800, lr=0.000275241, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=18.6, wall=52125 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 621 / 1689 loss=4.099, nll_loss=2.471, ppl=5.54, wps=452971, ups=1.05, wpb=432208, bsz=16431.7, num_updates=52900, lr=0.000274981, gnorm=0.204, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=52220 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 epoch 032: 721 / 1689 loss=4.107, nll_loss=2.48, ppl=5.58, wps=453813, ups=1.05, wpb=433494, bsz=16612.2, num_updates=53000, lr=0.000274721, gnorm=0.194, clip=0, loss_scale=4, train_wall=94, gb_free=18.2, wall=52316 begin validation on "valid" subset epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032 | valid on 'valid' subset | loss 4.221 | nll_loss 2.583 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.216 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 823 / 1689 loss=4.111, nll_loss=2.485, ppl=5.6, wps=343526, ups=0.79, wpb=435837, bsz=15929.6, num_updates=53100, lr=0.000274462, gnorm=0.195, clip=0, loss_scale=1, train_wall=107, gb_free=18.4, wall=52443 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 923 / 1689 loss=4.114, nll_loss=2.488, ppl=5.61, wps=459112, ups=1.06, wpb=433552, bsz=16541, num_updates=53200, lr=0.000274204, gnorm=0.187, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=52537 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1023 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=455269, ups=1.05, wpb=432986, bsz=16045.8, num_updates=53300, lr=0.000273947, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=52632 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1123 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=457603, ups=1.05, wpb=435090, bsz=16561.2, num_updates=53400, lr=0.00027369, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=17.3, wall=52728 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1223 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=459590, ups=1.06, wpb=433923, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.195, clip=0, loss_scale=1, train_wall=93, gb_free=17.2, wall=52822 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1323 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=453980, ups=1.05, wpb=432532, bsz=16187.1, num_updates=53600, lr=0.000273179, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=52917 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1423 / 1689 loss=4.115, nll_loss=2.491, ppl=5.62, wps=456283, ups=1.05, wpb=433508, bsz=16847, num_updates=53700, lr=0.000272925, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=53012 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1523 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=459648, ups=1.06, wpb=435367, bsz=16658.2, num_updates=53800, lr=0.000272671, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=53107 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 epoch 032: 1624 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=455492, ups=1.05, wpb=432779, bsz=16514.6, num_updates=53900, lr=0.000272418, gnorm=0.186, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=53202 end of epoch 32 (average epoch stats below) epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 epoch 032 | loss 4.107 | nll_loss 2.481 | ppl 5.58 | wps 447328 | ups 1.03 | wpb 433512 | bsz 16501.8 | num_updates 53965 | lr 0.000272254 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1589 | gb_free 21.7 | wall 53263 Start iterating over samples epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 epoch 033: 35 / 1689 loss=4.12, nll_loss=2.495, ppl=5.64, wps=456425, ups=1.06, wpb=431127, bsz=16449.2, num_updates=54000, lr=0.000272166, gnorm=0.2, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=53296 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.215 | nll_loss 2.577 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.215 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 135 / 1689 loss=4.085, nll_loss=2.455, ppl=5.48, wps=378082, ups=0.87, wpb=432691, bsz=16541.9, num_updates=54100, lr=0.000271914, gnorm=0.192, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=53411 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 235 / 1689 loss=4.103, nll_loss=2.476, ppl=5.56, wps=459886, ups=1.06, wpb=433060, bsz=16838.6, num_updates=54200, lr=0.000271663, gnorm=0.183, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=53505 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 335 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=459603, ups=1.06, wpb=434118, bsz=16078.7, num_updates=54300, lr=0.000271413, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=53599 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 435 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456287, ups=1.05, wpb=432593, bsz=16471.4, num_updates=54400, lr=0.000271163, gnorm=0.183, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=53694 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 535 / 1689 loss=4.111, nll_loss=2.486, ppl=5.6, wps=462677, ups=1.06, wpb=434854, bsz=16533.1, num_updates=54500, lr=0.000270914, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=53788 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 635 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=460268, ups=1.06, wpb=433496, bsz=16819.6, num_updates=54600, lr=0.000270666, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=53882 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 735 / 1689 loss=4.09, nll_loss=2.461, ppl=5.51, wps=458377, ups=1.06, wpb=433005, bsz=16415.8, num_updates=54700, lr=0.000270418, gnorm=0.198, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=53977 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 835 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459464, ups=1.06, wpb=434246, bsz=16773, num_updates=54800, lr=0.000270172, gnorm=0.201, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=54071 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 935 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=458083, ups=1.06, wpb=433296, bsz=15991.7, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=54166 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 epoch 033: 1036 / 1689 loss=4.108, nll_loss=2.482, ppl=5.59, wps=452998, ups=1.05, wpb=433091, bsz=16480.4, num_updates=55000, lr=0.00026968, gnorm=0.192, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=54262 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033 | valid on 'valid' subset | loss 4.225 | nll_loss 2.593 | ppl 6.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.215 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1136 / 1689 loss=4.112, nll_loss=2.486, ppl=5.6, wps=355881, ups=0.82, wpb=434198, bsz=16522.6, num_updates=55100, lr=0.000269435, gnorm=0.195, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=54384 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1236 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=463252, ups=1.07, wpb=433642, bsz=17214.2, num_updates=55200, lr=0.000269191, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=54477 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1336 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=462592, ups=1.07, wpb=432888, bsz=16417.4, num_updates=55300, lr=0.000268947, gnorm=0.203, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=54571 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1436 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=459653, ups=1.06, wpb=433723, bsz=16798.7, num_updates=55400, lr=0.000268705, gnorm=0.192, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=54665 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1536 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=459634, ups=1.06, wpb=434035, bsz=16562.9, num_updates=55500, lr=0.000268462, gnorm=0.196, clip=0, loss_scale=4, train_wall=93, gb_free=19.8, wall=54760 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 epoch 033: 1637 / 1689 loss=4.117, nll_loss=2.492, ppl=5.63, wps=456559, ups=1.05, wpb=435762, bsz=16150.8, num_updates=55600, lr=0.000268221, gnorm=0.185, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=54855 end of epoch 33 (average epoch stats below) epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 epoch 033 | loss 4.104 | nll_loss 2.477 | ppl 5.57 | wps 445629 | ups 1.03 | wpb 433528 | bsz 16505.3 | num_updates 55652 | lr 0.000268096 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 21.5 | wall 54904 Start iterating over samples epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 48 / 1689 loss=4.098, nll_loss=2.47, ppl=5.54, wps=291828, ups=0.68, wpb=431704, bsz=16171.7, num_updates=55700, lr=0.00026798, gnorm=0.191, clip=0, loss_scale=2, train_wall=115, gb_free=19.1, wall=55003 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 149 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=453791, ups=1.05, wpb=434235, bsz=16925.6, num_updates=55800, lr=0.00026774, gnorm=0.195, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=55099 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 250 / 1689 loss=4.084, nll_loss=2.455, ppl=5.48, wps=459900, ups=1.06, wpb=432346, bsz=16144.6, num_updates=55900, lr=0.0002675, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=55193 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 epoch 034: 350 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=457248, ups=1.06, wpb=432888, bsz=16300.2, num_updates=56000, lr=0.000267261, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=55287 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.216 | nll_loss 2.58 | ppl 5.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.215 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 450 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=401832, ups=0.93, wpb=433914, bsz=16582.9, num_updates=56100, lr=0.000267023, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=55395 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 550 / 1689 loss=4.092, nll_loss=2.464, ppl=5.52, wps=458546, ups=1.06, wpb=433154, bsz=16582.5, num_updates=56200, lr=0.000266785, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55490 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 650 / 1689 loss=4.104, nll_loss=2.477, ppl=5.57, wps=457856, ups=1.06, wpb=433560, bsz=16666.1, num_updates=56300, lr=0.000266548, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=55585 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 750 / 1689 loss=4.094, nll_loss=2.466, ppl=5.52, wps=457471, ups=1.06, wpb=433304, bsz=16697.2, num_updates=56400, lr=0.000266312, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=55679 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 850 / 1689 loss=4.1, nll_loss=2.473, ppl=5.55, wps=457657, ups=1.06, wpb=433594, bsz=16427.5, num_updates=56500, lr=0.000266076, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=55774 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 950 / 1689 loss=4.1, nll_loss=2.474, ppl=5.55, wps=456873, ups=1.05, wpb=433285, bsz=16601, num_updates=56600, lr=0.000265841, gnorm=0.202, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=55869 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1050 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456734, ups=1.06, wpb=432554, bsz=16183.1, num_updates=56700, lr=0.000265606, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=55964 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1150 / 1689 loss=4.098, nll_loss=2.471, ppl=5.54, wps=454576, ups=1.05, wpb=431595, bsz=16640.4, num_updates=56800, lr=0.000265372, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=56059 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1250 / 1689 loss=4.11, nll_loss=2.485, ppl=5.6, wps=458506, ups=1.06, wpb=433276, bsz=16662.8, num_updates=56900, lr=0.000265139, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=56153 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 epoch 034: 1350 / 1689 loss=4.109, nll_loss=2.484, ppl=5.59, wps=454164, ups=1.04, wpb=434664, bsz=16697, num_updates=57000, lr=0.000264906, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=56249 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034 | valid on 'valid' subset | loss 4.217 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.215 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1450 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=407968, ups=0.94, wpb=434948, bsz=16337, num_updates=57100, lr=0.000264674, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=56355 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1550 / 1689 loss=4.112, nll_loss=2.487, ppl=5.61, wps=462668, ups=1.06, wpb=436546, bsz=16405.1, num_updates=57200, lr=0.000264443, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=56450 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 epoch 034: 1650 / 1689 loss=4.11, nll_loss=2.484, ppl=5.6, wps=462115, ups=1.06, wpb=435342, bsz=16336.1, num_updates=57300, lr=0.000264212, gnorm=0.2, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=56544 end of epoch 34 (average epoch stats below) epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 epoch 034 | loss 4.1 | nll_loss 2.473 | ppl 5.55 | wps 436301 | ups 1.01 | wpb 433533 | bsz 16505.5 | num_updates 57339 | lr 0.000264122 | gnorm 0.194 | clip 0 | loss_scale 2 | train_wall 1590 | gb_free 20.1 | wall 56580 Start iterating over samples epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 61 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=453713, ups=1.05, wpb=431032, bsz=16328.6, num_updates=57400, lr=0.000263982, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56639 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 162 / 1689 loss=4.086, nll_loss=2.457, ppl=5.49, wps=452846, ups=1.05, wpb=432410, bsz=16372.6, num_updates=57500, lr=0.000263752, gnorm=0.181, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=56734 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 262 / 1689 loss=4.085, nll_loss=2.456, ppl=5.49, wps=458914, ups=1.06, wpb=433813, bsz=16626.3, num_updates=57600, lr=0.000263523, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=56829 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 362 / 1689 loss=4.083, nll_loss=2.453, ppl=5.48, wps=460481, ups=1.06, wpb=434435, bsz=16530.8, num_updates=57700, lr=0.000263295, gnorm=0.199, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=56923 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 462 / 1689 loss=4.105, nll_loss=2.478, ppl=5.57, wps=459694, ups=1.06, wpb=435143, bsz=16464.8, num_updates=57800, lr=0.000263067, gnorm=0.193, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=57018 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 562 / 1689 loss=4.089, nll_loss=2.461, ppl=5.51, wps=456915, ups=1.05, wpb=435033, bsz=16343.9, num_updates=57900, lr=0.00026284, gnorm=0.189, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57113 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 epoch 035: 662 / 1689 loss=4.097, nll_loss=2.469, ppl=5.54, wps=458274, ups=1.06, wpb=433846, bsz=16407, num_updates=58000, lr=0.000262613, gnorm=0.197, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=57208 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035 | valid on 'valid' subset | loss 4.223 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.215 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 763 / 1689 loss=4.101, nll_loss=2.474, ppl=5.56, wps=407167, ups=0.93, wpb=436165, bsz=16823.3, num_updates=58100, lr=0.000262387, gnorm=0.188, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=57315 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 863 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=464262, ups=1.08, wpb=431760, bsz=16638.2, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=57408 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 963 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=462480, ups=1.07, wpb=432768, bsz=16621.8, num_updates=58300, lr=0.000261936, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=57502 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1063 / 1689 loss=4.107, nll_loss=2.481, ppl=5.58, wps=465148, ups=1.07, wpb=435238, bsz=16254.6, num_updates=58400, lr=0.000261712, gnorm=0.187, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=57595 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1163 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=462068, ups=1.07, wpb=432414, bsz=16312.6, num_updates=58500, lr=0.000261488, gnorm=0.19, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=57689 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1263 / 1689 loss=4.096, nll_loss=2.469, ppl=5.54, wps=461547, ups=1.07, wpb=430648, bsz=16234.1, num_updates=58600, lr=0.000261265, gnorm=0.19, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=57782 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1364 / 1689 loss=4.094, nll_loss=2.467, ppl=5.53, wps=454482, ups=1.05, wpb=433008, bsz=16378.6, num_updates=58700, lr=0.000261042, gnorm=0.199, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=57877 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1464 / 1689 loss=4.109, nll_loss=2.483, ppl=5.59, wps=456401, ups=1.05, wpb=435487, bsz=16626.8, num_updates=58800, lr=0.00026082, gnorm=0.187, clip=0, loss_scale=2, train_wall=94, gb_free=17.8, wall=57973 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1564 / 1689 loss=4.101, nll_loss=2.475, ppl=5.56, wps=456197, ups=1.05, wpb=434496, bsz=16851, num_updates=58900, lr=0.000260599, gnorm=0.196, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=58068 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 epoch 035: 1664 / 1689 loss=4.107, nll_loss=2.482, ppl=5.59, wps=459243, ups=1.06, wpb=432312, bsz=16710.9, num_updates=59000, lr=0.000260378, gnorm=0.189, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=58162 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 epoch 035 | valid on 'valid' subset | loss 4.211 | nll_loss 2.582 | ppl 5.99 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.211 end of epoch 35 (average epoch stats below) epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 epoch 035 | loss 4.096 | nll_loss 2.469 | ppl 5.54 | wps 442689 | ups 1.02 | wpb 433538 | bsz 16501 | num_updates 59025 | lr 0.000260323 | gnorm 0.191 | clip 0 | loss_scale 2 | train_wall 1576 | gb_free 21.6 | wall 58231 Start iterating over samples epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 75 / 1689 loss=4.089, nll_loss=2.461, ppl=5.5, wps=305231, ups=0.71, wpb=431235, bsz=16576.7, num_updates=59100, lr=0.000260157, gnorm=0.197, clip=0, loss_scale=2, train_wall=99, gb_free=20.1, wall=58303 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 176 / 1689 loss=4.078, nll_loss=2.447, ppl=5.45, wps=463782, ups=1.07, wpb=433965, bsz=16610.8, num_updates=59200, lr=0.000259938, gnorm=0.197, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=58397 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 276 / 1689 loss=4.08, nll_loss=2.45, ppl=5.46, wps=461479, ups=1.07, wpb=432310, bsz=16241.2, num_updates=59300, lr=0.000259718, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=58491 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 376 / 1689 loss=4.083, nll_loss=2.454, ppl=5.48, wps=460768, ups=1.06, wpb=433655, bsz=16620, num_updates=59400, lr=0.0002595, gnorm=0.191, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58585 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 476 / 1689 loss=4.088, nll_loss=2.459, ppl=5.5, wps=459781, ups=1.06, wpb=431837, bsz=16671.2, num_updates=59500, lr=0.000259281, gnorm=0.194, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=58679 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 576 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=454578, ups=1.05, wpb=431282, bsz=16469, num_updates=59600, lr=0.000259064, gnorm=0.193, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=58774 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 676 / 1689 loss=4.089, nll_loss=2.46, ppl=5.5, wps=456742, ups=1.06, wpb=431875, bsz=16179.5, num_updates=59700, lr=0.000258847, gnorm=0.182, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=58868 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 777 / 1689 loss=4.095, nll_loss=2.467, ppl=5.53, wps=456602, ups=1.05, wpb=433522, bsz=16287.7, num_updates=59800, lr=0.00025863, gnorm=0.19, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=58963 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 877 / 1689 loss=4.09, nll_loss=2.462, ppl=5.51, wps=458040, ups=1.05, wpb=434648, bsz=16410.2, num_updates=59900, lr=0.000258414, gnorm=0.205, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=59058 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 epoch 036: 977 / 1689 loss=4.091, nll_loss=2.463, ppl=5.51, wps=459646, ups=1.06, wpb=434046, bsz=16435.5, num_updates=60000, lr=0.000258199, gnorm=0.186, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=59152 Stopping training due to num_updates: 60000 >= max_update: 60000 begin validation on "valid" subset epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 epoch 036 | valid on 'valid' subset | loss 4.213 | nll_loss 2.579 | ppl 5.97 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.211 end of epoch 36 (average epoch stats below) epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 epoch 036 | loss 4.086 | nll_loss 2.457 | ppl 5.49 | wps 452615 | ups 1.04 | wpb 433142 | bsz 16460.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.192 | clip 0 | loss_scale 2 | train_wall 909 | gb_free 18.5 | wall 59164 done training in 59150.5 seconds